# <span style="color:red">**Please upvote this notebook if you find it useful!**</span>

# Loading Packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
import warnings
warnings.filterwarnings("ignore")


%matplotlib inline
# Load the training CSV from the Kaggle input directory into `df`.
df=pd.read_csv("../input/av-janatahack-machine-learning-in-agriculture/train_yaOffsB.csv")

# Extraction of all kind of data from the .csv file

Getting the first 5 rows of the file

In [None]:
df.head()

Getting the last 10 rows

In [None]:
df.tail(10)

Total no. of rows and columns

In [None]:
df.shape

Index of all columns

In [None]:
df.columns

Datatypes of all columns

In [None]:
df.dtypes

Summary statistics (count, mean, min, etc.) for the numeric columns only.

In [None]:
df.describe()

It does the same but includes categorical variables

In [None]:
df.describe(include = 'all')

Shows the count of missing values if present

In [None]:
df.isnull().sum()

Here, there are 9000 missing values in the Number_Weeks_Used column.
We need to fill them in.

Creating a table which contains unique count and null count

In [None]:
# Per-column overview of `df`: dtype, number of missing values and number of
# distinct values, indexed by column name.
temp = pd.DataFrame(
    {
        'data_type': df.dtypes,
        'null_count': df.isnull().sum(),
        'unique_count': df.nunique(),
    }
)
temp

pivot table for finding relation between pesticide use category and number of weeks of its use.

In [None]:
# Total Number_Weeks_Used per Pesticide_Use_Category.
# The aggregation is named as the string 'sum', like the pivot tables later in
# this notebook; passing the NumPy callable np.sum makes recent pandas emit a
# FutureWarning about how callables will be applied.
table = pd.pivot_table(
    data=df,
    index='Pesticide_Use_Category',
    values='Number_Weeks_Used',
    aggfunc='sum',
)
table

It is clear from the table above that the total Number_Weeks_Used is zero for one Pesticide_Use_Category — the category coded 1, which is the value the imputation code below checks for.

Pivot table for finding relation between soil type and number of weeks of its use.

In [None]:
# Total Number_Weeks_Used per Soil_Type.
# The aggregation is named as the string 'sum', like the pivot tables later in
# this notebook; passing the NumPy callable np.sum makes recent pandas emit a
# FutureWarning about how callables will be applied.
table1 = pd.pivot_table(
    data=df,
    index='Soil_Type',
    values='Number_Weeks_Used',
    aggfunc='sum',
)
table1

Relation between soiltype and crop type

In [None]:
# Row counts for every (Soil_Type, Crop_Type) combination.
cross1 = pd.crosstab(index=df['Soil_Type'], columns=df['Crop_Type'])
cross1


Relation between soil type and pesticide used category

In [None]:
# Row counts for every (Soil_Type, Pesticide_Use_Category) combination.
cross2 = pd.crosstab(index=df['Soil_Type'], columns=df['Pesticide_Use_Category'])
cross2

Finding the most frequent (modal) number of weeks of pesticide use

In [None]:
# Series.mode() returns the most frequently occurring value(s) of the column
# (as a Series), not the largest value, despite the variable name.
maxi = df['Number_Weeks_Used'].mode()
maxi

# Missing Values

From the above, for Pesticide_Use_Category 1 the number of weeks pesticide was used is zero, so missing values in that category are filled with 0.
For every other Pesticide_Use_Category, missing values are filled with 20 (the value chosen from the mode computed above).
 Thus, the empty cells in the column are filled.

In [None]:

# Fill missing Number_Weeks_Used values in one vectorized pass:
#   * rows with Pesticide_Use_Category == 1 get 0.0
#   * all other rows get 20.0
# A single .loc assignment per case replaces the row-by-row loop that wrote
# through chained indexing (df[col][i] = ...). Chained assignment triggers
# SettingWithCopyWarning and does not modify `df` under pandas copy-on-write.
weeks_missing = df['Number_Weeks_Used'].isnull()
is_category_one = df['Pesticide_Use_Category'] == 1
df.loc[weeks_missing & is_category_one, 'Number_Weeks_Used'] = 0.0
df.loc[weeks_missing & ~is_category_one, 'Number_Weeks_Used'] = 20.0

In [None]:
df['Number_Weeks_Used'].isnull().sum()

Confirming that our values were filled properly

In [None]:
df[['Number_Weeks_Used','Pesticide_Use_Category']].head(10)

In [None]:
df[['Number_Weeks_Used','Pesticide_Use_Category']].tail(10)

# Change of  required DataTypes

In [None]:
# Cast Number_Weeks_Used to int64 now that it no longer contains NaN;
# astype returns a new frame, which is rebound to `df`.
df=df.astype({'Number_Weeks_Used':'int64'})

In [None]:
df.dtypes

# Univariate analysis

In [None]:
df['Crop_Type'].value_counts().plot.bar()

Most crops are of type '0'.

In [None]:
df['Soil_Type'].value_counts().plot.bar()

More crops are cultivated in soil type '0'.

In [None]:
df['Season'].value_counts().plot.bar()

Season 2 accounts for the most crops in the data.

In [None]:

df['Estimated_Insects_Count'].plot.box()

Presence of outliers above 3500 in Estimated_Insects_Count.

In [None]:
df['Number_Doses_Week'].plot.box()

Presence of outliers above 70 in Number_Doses_Week.

In [None]:
df['Number_Weeks_Used'].plot.box()

Presence of outliers above 60 in Number_Weeks_Used.

In [None]:
df['Number_Weeks_Quit'].plot.box()

Presence of outliers above 40 in Number_Weeks_Quit.

# Treating the presence of Outliers

Outliers are replaced by mean in all respective columns

In [None]:
# Replace Estimated_Insects_Count values above 3500 with the column mean
# (the mean is taken over the whole column, outliers included).
# The column is cast to float64 first: if it is integer-typed, writing a float
# mean into it relies on silent upcasting, which pandas has deprecated (PDEP-6).
df['Estimated_Insects_Count'] = df['Estimated_Insects_Count'].astype('float64')
insect_outliers = df['Estimated_Insects_Count'] > 3500
df.loc[insect_outliers, 'Estimated_Insects_Count'] = np.mean(df['Estimated_Insects_Count'])
df['Estimated_Insects_Count'].plot.box()

In [None]:
# Replace Number_Doses_Week values above 69 with the column mean
# (the mean is taken over the whole column, outliers included).
# The column is cast to float64 first: if it is integer-typed, writing a float
# mean into it relies on silent upcasting, which pandas has deprecated (PDEP-6).
df['Number_Doses_Week'] = df['Number_Doses_Week'].astype('float64')
dose_outliers = df['Number_Doses_Week'] > 69
df.loc[dose_outliers, 'Number_Doses_Week'] = np.mean(df['Number_Doses_Week'])
df['Number_Doses_Week'].plot.box()

In [None]:
# Replace Number_Weeks_Used values above 60 with the column mean
# (the mean is taken over the whole column, outliers included).
# The column was cast to int64 earlier in the notebook, so it is cast to
# float64 first: writing a float mean into an integer column relies on silent
# upcasting, which pandas has deprecated (PDEP-6).
df['Number_Weeks_Used'] = df['Number_Weeks_Used'].astype('float64')
weeks_used_outliers = df['Number_Weeks_Used'] > 60
df.loc[weeks_used_outliers, 'Number_Weeks_Used'] = np.mean(df['Number_Weeks_Used'])
df['Number_Weeks_Used'].plot.box()

In [None]:
# Replace Number_Weeks_Quit values above 40 with the column mean
# (the mean is taken over the whole column, outliers included).
# The column is cast to float64 first: if it is integer-typed, writing a float
# mean into it relies on silent upcasting, which pandas has deprecated (PDEP-6).
df['Number_Weeks_Quit'] = df['Number_Weeks_Quit'].astype('float64')
weeks_quit_outliers = df['Number_Weeks_Quit'] > 40
df.loc[weeks_quit_outliers, 'Number_Weeks_Quit'] = np.mean(df['Number_Weeks_Quit'])
df['Number_Weeks_Quit'].plot.box()

The datatypes are changed after outlier treatment. So,change to required data type

In [None]:
df.dtypes

In [None]:
# Cast the four outlier-treated columns back to int64. Any fractional part
# introduced by the mean replacement is dropped by the integer cast.
df=df.astype({'Estimated_Insects_Count':'int64','Number_Doses_Week':'int64','Number_Weeks_Used':'int64','Number_Weeks_Quit':'int64'})

In [None]:
df.dtypes

# Bivariate Analysis

In [None]:
df.plot.scatter('Number_Weeks_Used','Number_Doses_Week')

The above scatter plot shows no clear linear relation between them, so the correlation will tend to zero in this case.

The scatter plot below shows Crop_Damage by colour.

In [None]:
# Scatter of Number_Weeks_Used against Number_Doses_Week, with each point
# coloured by its Crop_Damage class (0, 1 or 2).
colours = {0: 'green', 1: 'yellow', 2: 'purple'}
point_colours = [colours[damage_level] for damage_level in df['Crop_Damage']]

fig, ax = plt.subplots()
ax.scatter(df['Number_Weeks_Used'], df['Number_Doses_Week'], c=point_colours)
ax.set_xlabel('Number_Weeks_Used')
ax.set_ylabel('Number_Doses_Week')

plt.show()

In [None]:
cp=[df['Number_Weeks_Used'].corr(df['Number_Doses_Week'])]
cp

The correlation is close to zero. The magnitude of the coefficient gives the strength of the relationship and its sign gives the direction.

In [None]:
grp=df.groupby('Pesticide_Use_Category')['Estimated_Insects_Count'].mean().plot.bar()


The above is the bar graph of mean Estimated_Insects_Count for each Pesticide_Use_Category. Places where pesticides are not used have a higher insect count than places where they are used.

In [None]:
df.pivot_table('Estimated_Insects_Count', ['Crop_Type', 'Soil_Type'], 'Season')

The relation between soil,crop,season together and estimated insect count.

In [None]:
df.pivot_table('Number_Weeks_Used', ['Crop_Type', 'Soil_Type'], 'Season')

The relation between soil,crop,season together and number of weeks pesticide used.

Adding another column with value 1 everywhere for a required table below.

In [None]:
df['count']=1


In [None]:
# Number of rows per (Season, Soil_Type, Crop_Type), split by Pesticide_Use_Category.
df.pivot_table(
    values='count',
    index=['Season', 'Soil_Type', 'Crop_Type'],
    columns=['Pesticide_Use_Category'],
    aggfunc='sum',
)

In [None]:
# Number of rows per (Season, Soil_Type, Crop_Type), split by
# Pesticide_Use_Category and Crop_Damage.
df.pivot_table(
    values='count',
    index=['Season', 'Soil_Type', 'Crop_Type'],
    columns=['Pesticide_Use_Category', 'Crop_Damage'],
    aggfunc='sum',
)

The above two tables show the relation among all the low-cardinality columns: soil type, crop type, season, pesticide use category and crop damage. Each value is the number of crops with that combination of categories.

The below bar graphs represents the relation with target column and some other columns.

In [None]:
df.groupby('Crop_Damage')['Estimated_Insects_Count'].mean().plot.bar()

More Crops are damaged by insects.

In [None]:
df.groupby('Crop_Damage')['Number_Doses_Week'].mean().plot.bar()

In [None]:
df.groupby('Crop_Damage')['Number_Weeks_Used'].mean().plot.bar()

Excessive use of pesticides damaged the crop a lot.

In [None]:
df.groupby('Crop_Damage')['Number_Weeks_Quit'].mean().plot.bar()

The quiting of the use of pesticides mostly prevented crop damage.

# Feature Engineering

The table below adds a Total_Doses column to the data, computed as the number of pesticide doses per week multiplied by the number of weeks of use.

In [None]:
# Total_Doses = doses per week * weeks of use; show the last rows next to its inputs.
doses_per_week = df['Number_Doses_Week']
weeks_used = df['Number_Weeks_Used']
df['Total_Doses'] = doses_per_week * weeks_used
df[['Total_Doses', 'Number_Weeks_Used', 'Number_Doses_Week']].tail()

The table below adds a Total_Days column, which is the sum of the number of weeks of pesticide use and the number of weeks since quitting (so the value is in weeks, despite the name).

In [None]:
# Total_Days = weeks quit + weeks used; show the last rows next to its inputs.
weeks_quit = df['Number_Weeks_Quit']
weeks_used = df['Number_Weeks_Used']
df['Total_Days'] = weeks_quit + weeks_used
df[['Total_Days', 'Number_Weeks_Used', 'Number_Weeks_Quit']].tail()

In [None]:
df.groupby('Crop_Damage')['Total_Doses'].mean().plot.bar()

The above bar plot shows that crops are mostly damaged due to over doses of pesticides and even due to some other reasons.

In [None]:
df.groupby('Crop_Damage')['Total_Days'].mean().plot.bar()

The above bar plot shows a rough equilibrium across all 3 cases.

The below four crosstab shows the relation among crop damage,season,croptype,soiltype and pesticide used category. The values represent the count of crops.

In [None]:
pd.crosstab(df['Crop_Damage'],df['Season'])

In [None]:
pd.crosstab(df['Crop_Damage'],df['Crop_Type'])

In [None]:
pd.crosstab(df['Crop_Damage'],df['Soil_Type'])

In [None]:
pd.crosstab(df['Crop_Damage'],df['Pesticide_Use_Category'])

# MODEL BUILDING

The predictor columns are stored in feature_table, and the Crop_Damage values are stored in target_values.
StandardScaler from sklearn's preprocessing module is used. It rescales each column to zero mean and unit standard deviation, so most values lie between -1 and 1.


The feature_table values are fitted and transformed by StandardScaler.

In [None]:
# Model inputs: the eight original predictor columns (the engineered
# Total_Doses / Total_Days columns and the helper 'count' column are not selected).
feature_columns = [
    "Estimated_Insects_Count",
    "Crop_Type",
    "Soil_Type",
    "Pesticide_Use_Category",
    "Number_Doses_Week",
    "Number_Weeks_Used",
    "Number_Weeks_Quit",
    "Season",
]
feature_table = df[feature_columns]

In [None]:
from sklearn import preprocessing
# Fit a StandardScaler on the training feature table; the fitted scaler `ft`
# is reused later to transform the test features.
# NOTE(review): the scaler is fitted on all training rows before the
# cross-validation below, so every validation fold contributes to the scaling
# statistics — confirm this is acceptable.
ft = preprocessing.StandardScaler().fit(feature_table)

In [None]:
# Replace the feature DataFrame with its scaled version (the output of
# ft.transform) and pull the Crop_Damage target out as a plain array.
feature_table = ft.transform(feature_table)
target_values = df["Crop_Damage"].values

K FOLD CROSS VALIDATION

Cross validation is a technique for evaluating predictive models by dividing the original data into a training set to train the model and a test set to evaluate it. The test set is treated as the validation set here.
In K-fold cross validation, the original sample is divided randomly into k equal subsamples. One subsample is used as the validation set and the remaining k-1 are used as the training set. The same step is repeated for each of the subsamples.
The advantages of this are: it prevents overfitting, reduces bias, reduces variance, and needs less computation time.

ACCURACY

It is an evaluation metric for classification problem. It is the ratio between correct prediction and total prediction.
The increase in K value of K-fold cross validation increases the sample accuracy.

CROSS_VAL_SCORE

This is the way to apply the k-fold technique to an estimator. It calculates the score for each CV split using the chosen evaluation metric.

For every model below, the k-fold technique is used for validation and accuracy is used as the evaluation metric.
The value of k in all cases is 30, with a random state of 7.

LOGISTIC REGRESSION

Logistic Regression is a supervised algorithm.It is used in classification problems.It is a predictive analysis.The cost function of it is Log Loss.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Shuffled 30-fold splitter with a fixed seed; later model cells reuse `kfold`.
kfold = KFold(n_splits=30, shuffle=True, random_state=7)

# Per-fold accuracy of a default logistic regression.
lr = LogisticRegression()
scores = cross_val_score(
    estimator=lr,
    X=feature_table,
    y=target_values,
    scoring='accuracy',
    cv=kfold,
)
scores

In [None]:
scores.mean(),scores.std()

KNN

It is also known as a lazy learning algorithm. In this algorithm, a new test instance is placed among the points of the training dataset.
Then the distance from the new test instance to every training data point is calculated. After that, the K nearest points are taken from the sorted list of distances, and the prediction for that instance is made from them.
For a classification problem, the mode of their classes is used. The value of K is determined by the elbow method. Here, k=3.

In [None]:
from sklearn.neighbors import KNeighborsClassifier as KNN

# Per-fold accuracy of a 3-nearest-neighbours classifier on the shared splitter.
knn = KNN(n_neighbors=3)
scores1 = cross_val_score(
    estimator=knn,
    X=feature_table,
    y=target_values,
    scoring='accuracy',
    cv=kfold,
)
scores1

In [None]:
scores1.mean(),scores1.std()

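DECISION TREE

It is a supervised ML algorithm. It uses a tree representation to solve the problem, where each leaf node corresponds to a class label and each internal node corresponds to an attribute. Its main objective is to have pure nodes. Gini impurity and information gain are the usual criteria for choosing the best split points.

Gini impurity $= 1 - \sum_i p_i^2$ ; Information gain $= H(\text{parent}) - \sum_k \frac{n_k}{n} H(\text{child}_k)$, where $H = -\sum_i p_i \log_2 p_i$ is the entropy and $p_i$ is the fraction of samples of class $i$ in the node.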

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree with a fixed seed.
# `criterion` is passed by keyword: current scikit-learn accepts estimator
# constructor parameters only as keywords, so the positional form
# DecisionTreeClassifier('entropy', ...) raises TypeError there.
dt = DecisionTreeClassifier(criterion='entropy', random_state=7)

# Per-fold accuracy on the shared splitter.
scores2 = cross_val_score(dt, feature_table, target_values, scoring='accuracy', cv=kfold)
scores2

In [None]:
scores2.mean(),scores2.std()

RANDOM FOREST

It is a supervised ML algorithm which builds decision trees on samples of the data and gets a prediction from each of them. It then selects the final prediction by voting. It reduces overfitting by averaging the results.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed so the cross-validation scores (and the
# voting ensemble that reuses `rf`) are reproducible between runs; 7 matches
# the seed used for the splitter and the decision tree.
rf = RandomForestClassifier(random_state=7)

# Per-fold accuracy on the shared splitter.
scores3 = cross_val_score(rf, feature_table, target_values, scoring='accuracy', cv=kfold)
scores3

In [None]:
scores3.mean(),scores3.std()

# Test Data Sample

In the following, the test dataset has been extracted and cleaned like train data set.

In [None]:
test=pd.read_csv("../input/av-janatahack-machine-learning-in-agriculture/test_pFkWwen.csv")

In [None]:
test.head()

In [None]:
test.isnull().sum()

In [None]:
# Fill missing Number_Weeks_Used values in the test set with the same rule
# applied to the training set:
#   * rows with Pesticide_Use_Category == 1 get 0.0
#   * all other rows get 20.0
# A single .loc assignment per case replaces the row-by-row loop that wrote
# through chained indexing (test[col][i] = ...). Chained assignment triggers
# SettingWithCopyWarning and does not modify `test` under pandas copy-on-write.
test_weeks_missing = test['Number_Weeks_Used'].isnull()
test_is_category_one = test['Pesticide_Use_Category'] == 1
test.loc[test_weeks_missing & test_is_category_one, 'Number_Weeks_Used'] = 0.0
test.loc[test_weeks_missing & ~test_is_category_one, 'Number_Weeks_Used'] = 20.0

In [None]:
test['Number_Weeks_Used'].isnull().sum()

In [None]:
test.dtypes

In [None]:
test['count']=1

In [None]:
test['Estimated_Insects_Count'].plot.box()

In [None]:
test['Number_Doses_Week'].plot.box()

In [None]:
test['Number_Weeks_Used'].plot.box()

In [None]:
test['Number_Weeks_Quit'].plot.box()

In [None]:
# Replace Estimated_Insects_Count values above 3500 with the column mean.
# The mean (not the median) is used so the test set is cleaned the same way as
# the training set, where every outlier column is replaced by its mean.
# The column is cast to float64 first: if it is integer-typed, writing a float
# mean into it relies on silent upcasting, which pandas has deprecated (PDEP-6).
test['Estimated_Insects_Count'] = test['Estimated_Insects_Count'].astype('float64')
test_insect_outliers = test['Estimated_Insects_Count'] > 3500
test.loc[test_insect_outliers, 'Estimated_Insects_Count'] = np.mean(test['Estimated_Insects_Count'])
test['Estimated_Insects_Count'].plot.box()

In [None]:
# Replace Number_Doses_Week values above 69 with the column mean
# (the mean is taken over the whole column, outliers included).
# The column is cast to float64 first: if it is integer-typed, writing a float
# mean into it relies on silent upcasting, which pandas has deprecated (PDEP-6).
test['Number_Doses_Week'] = test['Number_Doses_Week'].astype('float64')
test_dose_outliers = test['Number_Doses_Week'] > 69
test.loc[test_dose_outliers, 'Number_Doses_Week'] = np.mean(test['Number_Doses_Week'])
test['Number_Doses_Week'].plot.box()

In [None]:
# Replace Number_Weeks_Used values above 60 with the column mean
# (the mean is taken over the whole column, outliers included).
# The explicit float64 cast keeps the assignment free of implicit dtype
# upcasting (deprecated by pandas, PDEP-6); it is a no-op if the column is
# already float.
test['Number_Weeks_Used'] = test['Number_Weeks_Used'].astype('float64')
test_weeks_used_outliers = test['Number_Weeks_Used'] > 60
test.loc[test_weeks_used_outliers, 'Number_Weeks_Used'] = np.mean(test['Number_Weeks_Used'])
test['Number_Weeks_Used'].plot.box()

In [None]:
# Replace Number_Weeks_Quit values above 40 with the column mean
# (the mean is taken over the whole column, outliers included).
# The column is cast to float64 first: if it is integer-typed, writing a float
# mean into it relies on silent upcasting, which pandas has deprecated (PDEP-6).
test['Number_Weeks_Quit'] = test['Number_Weeks_Quit'].astype('float64')
test_weeks_quit_outliers = test['Number_Weeks_Quit'] > 40
test.loc[test_weeks_quit_outliers, 'Number_Weeks_Quit'] = np.mean(test['Number_Weeks_Quit'])
test['Number_Weeks_Quit'].plot.box()

In [None]:
# Cast the four outlier-treated test columns to int64. Any fractional part
# introduced by the replacement values is dropped by the integer cast.
test=test.astype({'Estimated_Insects_Count':'int64','Number_Doses_Week':'int64','Number_Weeks_Used':'int64','Number_Weeks_Quit':'int64'})

In [None]:
test.dtypes

In [None]:
# Total_Doses = doses per week * weeks of use; show the last rows next to its inputs.
test_doses_per_week = test['Number_Doses_Week']
test_weeks_used = test['Number_Weeks_Used']
test['Total_Doses'] = test_doses_per_week * test_weeks_used
test[['Total_Doses', 'Number_Weeks_Used', 'Number_Doses_Week']].tail()

In [None]:
# Total_Days = weeks quit + weeks used; show the last rows next to its inputs.
test_weeks_quit = test['Number_Weeks_Quit']
test_weeks_used = test['Number_Weeks_Used']
test['Total_Days'] = test_weeks_quit + test_weeks_used
test[['Total_Days', 'Number_Weeks_Used', 'Number_Weeks_Quit']].tail()

# Prediction

The test columns are stored in test_features and the values are transformed by StandardScaler.

In [None]:
# Select the same eight predictor columns, in the same order, that the
# training feature table uses.
test_feature_columns = [
    "Estimated_Insects_Count",
    "Crop_Type",
    "Soil_Type",
    "Pesticide_Use_Category",
    "Number_Doses_Week",
    "Number_Weeks_Used",
    "Number_Weeks_Quit",
    "Season",
]
test_features = test[test_feature_columns]

In [None]:
# Scale the test features with `ft`, the scaler already fitted on the training
# features (transform only — the scaler is not refitted here).
test_features = ft.transform(test_features)

VOTING CLASSIFIER

It is a wrapper around a set of different algorithms that are trained and evaluated in parallel in order to exploit the different strengths of each algorithm. With hard voting, each model predicts a class and the class chosen by the majority becomes the final prediction. It often yields better performance than any single algorithm.

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Hard-voting ensemble of the four models defined above, scored per fold on
# the shared splitter.
base_estimators = [('lr', lr), ('knn', knn), ('dt', dt), ('rf', rf)]
vc = VotingClassifier(estimators=base_estimators, voting='hard')
result = cross_val_score(
    estimator=vc,
    X=feature_table,
    y=target_values,
    scoring='accuracy',
    cv=kfold,
)
result

In [None]:
result.mean(),result.std()

Predicting the test target values i.e. Crop_Damage in test dataset

In [None]:
vc=vc.fit(feature_table,target_values)

In [None]:
test_Crop_Damage = vc.predict(test_features)

In [None]:
solution = pd.DataFrame(test_Crop_Damage)

In [None]:
# Pair each test ID with its predicted Crop_Damage value.
# The ID column is held in `test_ids` rather than `id`, so the built-in id()
# function is not shadowed for the rest of the kernel session.
test_ids = test['ID']
sample_solution = pd.concat([test_ids, solution], axis=1)
sample_solution.columns = ['ID', 'Crop_Damage']


The predicted dataset is saved in .csv file

In [None]:
# Write only the ID and Crop_Damage columns. index=False keeps the DataFrame's
# row index from being written as an extra, unnamed first column.
sample_solution.to_csv('sample_solution.csv', index=False)

In [None]:
ss=pd.read_csv("sample_solution.csv")

In [None]:
ss.head()