In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the avocado dataset from 'avocado.csv' (path is relative to the working directory).
df = pd.read_csv('avocado.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

    1. Column Unnamed can be dropped as it does not provide any value for the analysis.
    2. There may be some outliers present, as there is a wide gap between the 75% and max values in most of the columns.
    3. We need to scale the dataset as range varies for each column.
    4. We can also drop date column.

In [None]:
# List the column labels.
df.columns

In [None]:
# Frequency of each distinct 'XLarge Bags' value.
# Author's observation from this output: more than 60% of the rows are 0,
# which is the stated reason the column is dropped later on.
df['XLarge Bags'].value_counts()

# Data Visualization

In [None]:
# Show the first rows again as a reference for the plots below.
df.head()

In [None]:
# Distribution of the AveragePrice column.
# NOTE(review): distplot is deprecated in seaborn >= 0.11; consider histplot/displot when upgrading.
sns.distplot(df['AveragePrice'])

    Average price of Avocados ranges between 0.5 and 3.0

In [None]:
# Number of rows per avocado type.
# The column is passed as `x=` explicitly: seaborn 0.11 warns when the vector is
# given positionally, and later releases interpret the first positional argument
# as `data`. The keyword form works on both old and new seaborn.
sns.countplot(x=df['type'])

      Two types of avocados are grown - conventional and organic.

In [None]:
# Heatmap of the boolean null mask; any missing cell would show up as a contrasting mark.
sns.heatmap(df.isnull())

    No null values.

# Data Manipulation

In [None]:
# Remove, in a single in-place call, the three columns that are not used further:
# the 'Unnamed: 0' column, the date and the XLarge Bags volume.
df.drop(columns=['Unnamed: 0', 'Date', 'XLarge Bags'], inplace=True)


In [None]:
# Reorder the columns so that AveragePrice becomes the last one.
leading_columns = [name for name in df.columns if name != 'AveragePrice']
df = df[leading_columns + ['AveragePrice']]

In [None]:
# Replace the two string columns with integer codes, one fresh encoder per column.
from sklearn.preprocessing import LabelEncoder

for column_name in ('type', 'region'):
    df[column_name] = LabelEncoder().fit_transform(df[column_name])

# Confirm the resulting dtypes.
df.info()

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
plt.figure(figsize=(10, 10))
corr = df.corr()
sns.heatmap(corr, annot=True)

    1. The columns type and year are positively correlated with the target variable AveragePrice.
    2. Bag sizes and avocado type are positively correlated with the target variable region.

In [None]:
# One distribution plot per column, to judge skewness by eye.
col = df.columns.values
plt.figure(figsize=(20, 20))
for slot, column_name in enumerate(col, start=1):
    # Panels are placed left-to-right on an 11 x 5 grid.
    plt.subplot(11, 5, slot)
    sns.distplot(df[column_name], color='red')
plt.show()

In [None]:
# One box plot per column, to spot outliers.
plt.figure(figsize=(10, 20))
for slot, column_name in enumerate(col, start=1):
    plt.subplot(10, 5, slot)
    sns.boxplot(df[column_name], palette='rocket', orient='v')
    plt.tight_layout()

    Many outliers are present.

In [None]:
# Filter out outlier rows with the z-score rule.
from scipy.stats import zscore

# Absolute z-score of every cell.
z = np.abs(zscore(df))
# Positions of the cells whose |z| exceeds 3.
print(np.where(z>3))
# Keep only the rows in which every column has |z| below 3.
within_limits = (z<3).all(axis=1)
df_new = df[within_limits]

In [None]:
# Shape before vs. after the outlier filter, to see how many rows were removed.
print(df.shape,"\t",df_new.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Targets are selected by column NAME rather than by position, so this cell no
# longer depends on the column order produced by an earlier cell. The double
# brackets keep each target a one-column DataFrame, the same shape the
# positional slices produced.

# Part A (regression): AveragePrice is the target variable.
x = df_new.drop('AveragePrice',axis=1)
y = df_new[['AveragePrice']]

# Part B (classification): region is the target variable.
X = df_new.drop('region',axis=1)
Y = df_new[['region']]

In [None]:
# Skewness of each column of x, the feature set used when AveragePrice is the target.
x.skew()

In [None]:
# Skewness of each column of X, the feature set used when region is the target.
X.skew()

In [None]:
# Reduce skewness with a Yeo-Johnson power transform.
from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer(method='yeo-johnson')
# Rebuild each frame with its original column labels and row index; the bare
# ndarray returned by fit_transform would otherwise produce 0..n-1 labels on both
# axes, making the skew() output unreadable and misaligning x/X with y/Y by index.
# NOTE(review): the transformer is fitted on the full data before any train/test
# split, so test rows influence the fitted parameters; fit on the training split
# only if an unbiased estimate is required.
X = pd.DataFrame(pt.fit_transform(X), columns=X.columns, index=X.index)
x = pd.DataFrame(pt.fit_transform(x), columns=x.columns, index=x.index)

In [None]:
# Skewness of x after the power transform.
x.skew()

In [None]:
# Skewness of X after the power transform.
X.skew()

# PART A - Predict Average Price

In [None]:
from sklearn.metrics import r2_score,mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression as LR
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor as KNR
from sklearn.linear_model import Lasso,Ridge,ElasticNet

In [None]:
# Baseline comparison of several regressors on one fixed train/test split.
model = [LR(),DTR(),KNR(),SVR(),Lasso(),Ridge(),ElasticNet()]

# The split uses a fixed random_state, so it is identical for every model:
# compute it once instead of once per loop iteration.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.20,random_state=40)

for estimator in model:
    estimator.fit(x_train,y_train)
    y_pred = estimator.predict(x_test)
    r2Score = r2_score(y_test,y_pred)
    # Computed once and reused for both the MSE and the RMSE lines.
    mse = mean_squared_error(y_test,y_pred)

    print("*************************************************************************************")
    print(estimator)
    print("R2 Score : ",r2Score)
    print("Mean Absolute Error : " ,mean_absolute_error(y_test,y_pred))
    print("Mean Squared Error : " ,mse)
    print("Root Mean Squared Error : " ,np.sqrt(mse))
    print("")
    print("*************************************************************************************")



    The KNeighbors Regressor performed best, with an R2 score of about 0.8487 (84.87%).

In [None]:
# 5-fold cross-validated R2 of a default KNeighbors regressor on the full regression data.
from sklearn.model_selection import cross_val_score

separator = "*************************************************************************************"

cv_score = cross_val_score(KNR(), x, y, scoring='r2', cv=5)

print(separator)
print("Score for ",KNR()," : ")
print("Score : ", cv_score)
print("Mean : ", cv_score.mean())
print("Standard Deviation : ", cv_score.std())
print(separator)
print("")

In [None]:
# Exhaustive search over neighbour count and weighting scheme for the KNeighbors regressor.
from sklearn.model_selection import GridSearchCV

parameters = {
    'n_neighbors': list(range(1, 31)),
    'weights': ['uniform', 'distance'],
}

gridsearch = GridSearchCV(estimator=KNR(), param_grid=parameters, n_jobs=-1, pre_dispatch=2)
gridsearch.fit(x, y)

# Best parameter combination found by the search.
gridsearch.best_params_

In [None]:
def random_state_counter(model):
    """Return the train/test split seed (10..89) that gives `model` its best test R2.

    For every seed the notebook-level regression data `x` / `y` is split 80/20,
    `model` is fitted on the training part and scored with `r2_score` on the
    test part. The first seed reaching the highest score is returned.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is fitted in place repeatedly and is left fitted on the last split.

    Returns
    -------
    int or None
        The best seed; ``None`` only if no score is comparable (e.g. all NaN).

    Notes
    -----
    Choosing the seed by the test score makes the reported score optimistic;
    treat the result as a best case, not as an unbiased estimate.
    """
    # Start below any possible R2 so a seed is always selected, even when every
    # split scores at or below zero (the previous 0 start left the result unbound).
    best_score = float('-inf')
    final_state = None
    for seed in range(10, 90):
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.20, random_state=seed)
        model.fit(x_train, y_train)
        score = r2_score(y_test, model.predict(x_test))
        if score > best_score:
            best_score = score
            final_state = seed
    return final_state

In [None]:
# Final KNeighbors regressor for Part A.
# NOTE(review): n_neighbors=30 / weights='distance' are hard-coded; presumably they
# were copied from the grid-search output above — confirm against gridsearch.best_params_.
knr = KNR(n_neighbors=30,weights='distance')
state = random_state_counter(knr)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.20,random_state = state)
knr.fit(x_train,y_train)

# This score is computed on the TRAINING split.
score = knr.score(x_train,y_train)
print("Score for KNeighbors regression : ",score)

# Metrics on the held-out test split.
y_pred = knr.predict(x_test)
r2Score = r2_score(y_test,y_pred)
# Label corrected: the model evaluated here is the KNeighbors regressor, not linear regression.
print("R2 Score for KNeighbors Regressor : ",r2Score)
mse = mean_squared_error(y_test,y_pred)
print("Mean Squared Error : " ,mse)
print("Mean Absolute Error : " ,mean_absolute_error(y_test,y_pred))
print("Root Mean Squared Error : " ,np.sqrt(mse))

In [None]:
#Adaboost Regressor and RandomforestRegressor
from sklearn.ensemble import AdaBoostRegressor as ABR
from sklearn.ensemble import GradientBoostingRegressor as GBR
from sklearn.ensemble import RandomForestRegressor as RFR


In [None]:
# Three ensemble regressors, each with 20 estimators and its own fixed seed,
# evaluated on the train/test split currently held in x_train / x_test.
ada = ABR(n_estimators=20, random_state=120)
gradient = GBR(n_estimators=20, random_state=13)
rfr = RFR(n_estimators=20, random_state=76)

boosting_model = [ada, gradient, rfr]
rule = "-----------------------------------------------------------"

for boost in boosting_model:
    boost.fit(x_train, y_train)
    pred = boost.predict(x_test)
    r2Score = r2_score(y_test, pred)
    print(rule)
    print(boost)
    print(rule)
    print("R2 Score : ", r2Score)
    print("\n")

    Out of all the models KNR performed best

In [None]:
# Refit the chosen KNeighbors regressor and report its test-split metrics.
knr.fit(x_train, y_train)
y_pred = knr.predict(x_test)

r2score = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("R2 Score for KNeighbor Regressor : ",r2score)
print("Mean Absolute Error : " ,mae)
print("Mean Squared Error : " ,mse)
print("Root Mean Squared Error : " ,np.sqrt(mse))

In [None]:
# Persist the fitted regressor, reload it and predict with the reloaded copy.
# `sklearn.externals.joblib` no longer exists in current scikit-learn releases;
# joblib is a standalone package (installed as a scikit-learn dependency).
# The fallback keeps the cell working on old scikit-learn installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(knr,'KNRModel.obj')

knr_from_joblib = joblib.load('KNRModel.obj')

finalOutput = knr_from_joblib.predict(x_test)

In [None]:
# Write the reloaded model's test-split predictions to a CSV file in the working directory.
pd.DataFrame(finalOutput).to_csv("Avocado_Regressor_Output.csv")

# Part B : Find the Region 

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [None]:
def calBestRandomStateOf(model):
    """Return the train/test split seed (40..99) that gives `model` its best test accuracy.

    For every seed the notebook-level classification data `X` / `Y` is split
    80/20, `model` is fitted on the training part and its predictions on the
    test part are scored with `accuracy_score`. The first seed reaching the
    highest accuracy is returned.

    Parameters
    ----------
    model : classifier exposing ``fit(X, y)`` and ``predict(X)``.
        It is fitted in place repeatedly and is left fitted on the last split.

    Returns
    -------
    int or None
        The best seed; ``None`` only if no score is comparable (e.g. all NaN).

    Notes
    -----
    Choosing the seed by the test score makes the reported accuracy optimistic.
    """
    # Accuracy is never negative, so -1 guarantees the first split is accepted
    # and the result is always bound.
    best_accuracy = -1.0
    final_state = None
    for seed in range(40, 100):
        x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.20, random_state=seed)
        model.fit(x_train, y_train)
        pred = model.predict(x_test)
        # Splits are ranked by classification accuracy; R2 is a regression metric
        # and is not meaningful on integer-encoded class labels.
        accuracy = accuracy_score(y_test, pred)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            final_state = seed

    return final_state

In [None]:
# Compare two classifiers for the region target, each on its own best-scoring split.
model = [DecisionTreeClassifier(), KNeighborsClassifier()]
separator = "*************************************************************************************"

for classifier in model:
    state = calBestRandomStateOf(classifier)
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.20, random_state=state)
    classifier.fit(x_train, y_train)

    # Training-split score, followed by test-split predictions and metrics.
    score = classifier.score(x_train, y_train)
    y_pred = classifier.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    # Computed but intentionally not printed.
    classificationReport = classification_report(y_test, y_pred)
    confusionMatrix = confusion_matrix(y_test, y_pred)

    print(separator)
    print("Random State : ",state)
    print("Score of ",classifier," is : ", score)
    print("Accuracy : ",accuracy*100,"% ")
    print(separator)


In [None]:
# 4-fold cross-validated accuracy of fresh, default-configured classifiers.
from sklearn.model_selection import cross_val_score

separator = "*************************************************************************************"
model = [DecisionTreeClassifier(), KNeighborsClassifier()]

for classifier in model:
    cv_score = cross_val_score(classifier, X, Y, scoring='accuracy', cv=4)
    print(separator)
    print("Score for ",classifier," : ")
    print("Score : ", cv_score)
    print("Mean : ", cv_score.mean())
    print("Standard Deviation : ", cv_score.std())
    print(separator)
    print("")

In [None]:
# Exhaustive search over neighbour count and weighting scheme for the KNeighbors classifier.
from sklearn.model_selection import GridSearchCV

parameters = {
    'n_neighbors': list(range(1, 31)),
    'weights': ['uniform', 'distance'],
}

gridsearch = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=parameters, n_jobs=-1, pre_dispatch=2)
gridsearch.fit(X, Y)

# Best parameter combination found by the search.
gridsearch.best_params_
                        

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# KNeighbors classifier for Part B (target: region).
knc = KNeighborsClassifier(n_neighbors=3,weights='distance')
# The split must use the classification data X / Y. The lowercase x / y hold the
# regression data, whose target is the continuous AveragePrice, which a classifier
# cannot be fitted on.
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=.20,random_state = 85)
knc.fit(x_train,y_train)

# This score is computed on the TRAINING split.
score = knc.score(x_train,y_train)
print("Score for KNeighbors Classifier : ",score)

# Metrics on the held-out test split.
y_pred = knc.predict(x_test)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy Score for KNC : ",accuracy)
# NOTE(review): the error metrics below compare integer-encoded region labels, so
# their magnitude has no natural interpretation; accuracy is the meaningful figure.
mse = mean_squared_error(y_test,y_pred)
print("Mean Squared Error : " ,mse)
print("Mean Absolute Error : " ,mean_absolute_error(y_test,y_pred))
print("Root Mean Squared Error : " ,np.sqrt(mse))

In [None]:
# 4-fold cross-validated accuracy of a default KNeighbors classifier.
# The target must be Y (region): the lowercase y is the continuous AveragePrice
# target of the regression part and does not belong with the feature set X.
cv_score = cross_val_score(KNeighborsClassifier(),X,Y,cv=4,scoring='accuracy')
print("*************************************************************************************")
print("Score for ",KNeighborsClassifier()," : ")
print("Score : ", cv_score)
print("Mean : ", cv_score.mean())
print("Standard Deviation : ", cv_score.std())
print("*************************************************************************************")
print("")

In [None]:
# Import the ensemble classifiers in this cell: the notebook otherwise imports these
# aliases only in the following cell, so on a fresh kernel ("Restart & Run All")
# the constructors below would raise NameError.
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import RandomForestClassifier as RFC

# Unseeded instances with 20 estimators each.
ada = ABC(n_estimators=20)
gradient = GBC(n_estimators=20)
rfc = RFC(n_estimators=20)


In [None]:
# AdaBoost, gradient-boosting and random-forest CLASSIFIERS, each with 20 estimators
# and its own fixed seed, evaluated on the split currently held in x_train / x_test.
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import RandomForestClassifier as RFC

ada = ABC(n_estimators=20, random_state=41)
gradient = GBC(n_estimators=20, random_state=162)
rfc = RFC(n_estimators=20, random_state=115)

boosting_model = [ada, gradient, rfc]
rule = "-----------------------------------------------------------"

for boost in boosting_model:
    boost.fit(x_train, y_train)
    pred = boost.predict(x_test)
    accuracyScore = accuracy_score(y_test, pred)
    print(rule)
    print(boost)
    print(rule)
    print("Accuracy Score : ", accuracyScore)
    print("\n")

In [None]:
# Final model for Part B: gradient boosting classifier with a fixed seed.
gradient = GBC(n_estimators=20,random_state=162)
gradient.fit(x_train,y_train)
y_pred= gradient.predict(x_test)
accuracy = accuracy_score(y_test,y_pred)
# Label corrected: the model evaluated here is the gradient boosting classifier, not an SVC.
print("Accuracy Score for Gradient Boosting Classifier : ",accuracy)
print("Mean Absolute Error : " ,mean_absolute_error(y_test,y_pred))
# Computed once and reused for both the MSE and the RMSE lines.
mse = mean_squared_error(y_test,y_pred)
print("Mean Squared Error : " ,mse)
print("Root Mean Squared Error : " ,np.sqrt(mse))

In [None]:
# Persist the fitted classifier, reload it and predict with the reloaded copy.
# `sklearn.externals.joblib` no longer exists in current scikit-learn releases;
# joblib is a standalone package (installed as a scikit-learn dependency).
# The fallback keeps the cell working on old scikit-learn installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(gradient,'gradientmodel.obj')

gradient_from_joblib = joblib.load('gradientmodel.obj')

final_output = gradient_from_joblib.predict(x_test)

In [None]:
# Write the reloaded classifier's test-split predictions to a CSV file in the working directory.
pd.DataFrame(final_output).to_csv("Avocado_Classifier_Output.csv")