In [None]:
# import libraries

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

from scipy import stats

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the avocado price table from its CSV file into a DataFrame.

avocado_csv = "../input/avocado-prices/avocado.csv"
data = pd.read_csv(avocado_csv)

In [None]:
# (rows, columns) of the frame as loaded from the CSV
data.shape

In [None]:
# preview the first three rows of the loaded frame
data.head(3)

In [None]:
# Fraction of missing values per column: the mean of the boolean null mask
# equals (number of nulls) / (number of rows).
data.isnull().mean()

In [None]:
# descriptive statistics of the frame, using pandas' `describe` defaults
data.describe()

In [None]:
# per-column dtype and non-null count overview
data.info()

In [None]:
# Let's look at how often each distinct value of the categorical `type` column occurs.

data['type'].value_counts()

In [None]:
# Remove the 'Unnamed: 0' and 'Date' columns from the frame.

data = data.drop(columns=['Unnamed: 0', 'Date'])

In [None]:
# Encode `type` as a binary flag: 1 for 'conventional', 0 for any other value.
# NOTE: this overwrites `type` in place, so re-running the cell without
# reloading `data` turns every row into 0.

is_conventional = data['type'] == 'conventional'
data['type'] = is_conventional.astype('int64')

In [None]:
# Integer-encode `region`: `labels1` holds one code per row,
# `levels1` the distinct region values the codes refer to.
labels1, levels1 = data['region'].factorize()

In [None]:
# Attach the integer region codes as a new column. The codes array is assigned
# directly (positionally, one code per row). Wrapping it in a DataFrame would
# make pandas align it on the index, which silently produces NaN or misplaced
# codes whenever `data` does not carry a default 0..n-1 index.
data['region_num'] = labels1

In [None]:
# Drop the original text column now that `region_num` carries its encoding.
data = data.drop(columns=['region'])

In [None]:
# preview the first three rows after the encoding steps
data.head(3)

In [None]:
# list the column names currently in the frame
data.columns

In [None]:
# Feature matrix and target for the first linear model.
feature_cols = [
    'Total Volume', '4046', '4225', '4770',
    'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags',
    'type', 'year', 'region_num',
]
X = data[feature_cols]
y = data['AveragePrice']

In [None]:
# Keep 85% of the rows for training and hold out the rest for testing;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.85,
    random_state=100,
)

In [None]:
# Fit a linear regression on the training split. `fit` returns the estimator
# itself, so it is bound in one step; the bare name on the last line keeps the
# estimator repr as the cell output.
model_lr01 = LinearRegression().fit(X_train, y_train)
model_lr01

In [None]:
# predict the target for the held-out test rows with the first model
y_predict = model_lr01.predict(X_test)

In [None]:
# Report the R^2 of the predictions against the true test targets.
test_r2 = r2_score(y_true=y_test, y_pred=y_predict)
print(f"r2 score of the model is {test_r2}")

In [None]:
# correlation: pairwise correlation matrix across the columns of `data`

data.corr()

### Model 2

In [None]:
# Second linear model: fitted without the 'Total Volume' column.
feature_cols = [
    '4046', '4225', '4770',
    'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags',
    'type', 'year', 'region_num',
]
X = data[feature_cols]
y = data['AveragePrice']

# Same 85/15 split and seed as the other models in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.85, random_state=100
)

# `fit` returns the estimator; the bare name keeps its repr as the cell output.
model_lr02 = LinearRegression().fit(X_train, y_train)
model_lr02


In [None]:
# Score the second model on its held-out test rows.
y_predict = model_lr02.predict(X_test)
test_r2 = r2_score(y_true=y_test, y_pred=y_predict)
print(f"r2 score of the model is {test_r2}")

In [None]:
# Third linear model: only `type`, `year` and the encoded region are used.
feature_cols = ['type', 'year', 'region_num']
X = data[feature_cols]
y = data['AveragePrice']

# Same 85/15 split and seed as the other models in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.85, random_state=100
)

# `fit` returns the estimator; the bare name keeps its repr as the cell output.
model_lr03 = LinearRegression().fit(X_train, y_train)
model_lr03

In [None]:
# Score the third model on its held-out test rows.
y_predict = model_lr03.predict(X_test)
test_r2 = r2_score(y_true=y_test, y_pred=y_predict)
print(f"r2 score of the model is {test_r2}")

### Statsmodels OLS

In [None]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# list the column names currently in the frame
data.columns

In [None]:
# Feature matrix and target for the statsmodels OLS fit.
feature_cols = [
    'Total Volume', '4046', '4225', '4770',
    'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags',
    'type', 'year', 'region_num',
]
X = data[feature_cols]

y = data['AveragePrice']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.85, random_state=100
)

# sm.OLS does not add an intercept on its own. Without one the fit is forced
# through the origin and summary() reports the uncentered R-squared.
# An explicit constant column is therefore prepended to BOTH splits, and the
# split variables are rebound so that later cells reusing X_train / X_test
# (VIF table, predict) see the same design matrix the model was fitted on.
X_train = sm.add_constant(X_train, has_constant='add')
X_test = sm.add_constant(X_test, has_constant='add')

model_s1 = sm.OLS(y_train, X_train).fit()

In [None]:
# display the fitted statsmodels model's summary table
model_s1.summary()

In [None]:
# Variance inflation factor for every column of the training design matrix,
# rounded to two decimals and sorted from highest to lowest.
design = X_train.values
vif_scores = [
    variance_inflation_factor(design, col_idx)
    for col_idx in range(design.shape[1])
]

vif = pd.DataFrame({'Features': X_train.columns, 'VIF': vif_scores})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by='VIF', ascending=False)
vif

In [None]:
# in-sample predictions: the model is applied to the same X_train it was fitted on
y_predict = model_s1.predict(X_train)

In [None]:
# first two actual training targets, for a side-by-side look at the predictions
y_train.head(2)

In [None]:
# first two predicted values, to compare against the actual targets
y_predict.head(2)