In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib
df= pd.read_csv("traindata.csv")
df["Date"] = pd.to_datetime(df["Date"]) 

# conventional avocado X regions X Year
# filter out all conventional avocado (type = conventional)
conventional_avo = df[df["type"].isin(['conventional'])]
# sort by average price
conventional_avo = conventional_avo.sort_values(by='AveragePrice')
# plot
organic_avo = df[df["type"].isin(['organic'])]
# sort by average price
organic_avo = organic_avo.sort_values(by='AveragePrice')

dummy_type = pd.get_dummies(df['type'])
# print sample
dummy_type.sample(2)
# concat
df = pd.concat([df, dummy_type], axis=1)
# print(df.sample(2))
df['region'] = df['region'].astype('category')
df.dtypes
df['region'] = df['region'].cat.codes
df['region'].sample(3)
df['Date_Q'] = df['Date'].apply(lambda x: x.quarter)
# set the size of the figure
# plt.figure(figsize=(22,12))
# set the title
# plt.title("Correlation Matrix")

coe_col = ['AveragePrice', 'Total Volume', '4046', '4225', '4770', 'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags', 
           'year', 'organic', 'conventional', 'Date_Q', 'region']
cm = np.corrcoef(df[coe_col].values.T)
sns.set(font_scale = 1.7)
# ax = sns.heatmap(cm,cbar = True, annot = True,square = True, fmt = '.2f', annot_kws = {'size':15}, yticklabels = coe_col, 
#                  xticklabels = coe_col)

Using matplotlib backend: Qt5Agg


In [2]:
# built regression function
X_columns = ['Total Volume', '4046', '4225', '4770', 'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags', 'conventional', 'organic', 
             'Date_Q', 'year', 'region']
X = df[X_columns]
Y = df['AveragePrice']

print('X Shape:', X.shape)
print('Y Shape:', Y.shape)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=2019)
from sklearn.feature_selection import mutual_info_regression
dependencies = mutual_info_regression(X_train, y_train)
column_list = list(X_train.columns)
print('Mean among dependencies of X v.s. Y', np.mean(dependencies))

for i in range(len(dependencies)):
    if dependencies[i] > np.mean(dependencies):
        
        print('* ', column_list[i], dependencies[i])
    else:
        print(column_list[i], dependencies[i])


X Shape: (16325, 13)
Y Shape: (16325,)
Mean among dependencies of X v.s. Y 0.20760060413418027
*  Total Volume 0.32099960026672214
*  4046 0.3053367754425107
*  4225 0.25109557588224174
*  4770 0.21242663070414514
*  Total Bags 0.28059688261266125
*  Small Bags 0.22218195230935134
Large Bags 0.19344987501794808
XLarge Bags 0.10467647776271027
*  conventional 0.23477419986711157
*  organic 0.22467776628162595
Date_Q 0.03400738234400569
year 0.046554668878610705
*  region 0.26803006637469906


In [3]:


X_columns = ['Total Volume', '4046', '4225', '4770', 'Total Bags', 'Small Bags', 'Large Bags', 'XLarge Bags', 'conventional', 'organic', 
             'Date_Q', 'year', 'region']
X = df[X_columns]
Y = df['AveragePrice']
selected_features =X_columns
print('X Shape:', X.shape)
print('Y Shape:', Y.shape)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=2019)
from sklearn.feature_selection import mutual_info_regression
dependencies = mutual_info_regression(X_train, y_train)
column_list = list(X_train.columns)
print('Mean among dependencies of X v.s. Y', np.mean(dependencies))
for i in range(len(dependencies)):
    if dependencies[i] > np.mean(dependencies):
        print('* ', column_list[i], dependencies[i])
    else:
        print(column_list[i], dependencies[i])
        selected_features.remove(column_list[i])



X Shape: (16325, 13)
Y Shape: (16325,)
Mean among dependencies of X v.s. Y 0.20707884577410005
*  Total Volume 0.32003128866620134
*  4046 0.3059100979920837
*  4225 0.2486786963749017
*  4770 0.2083573758651487
*  Total Bags 0.28093452517577333
*  Small Bags 0.2218198230753865
Large Bags 0.19417244552975976
XLarge Bags 0.09907656133337062
*  conventional 0.2310526751007771
*  organic 0.23624486684439638
Date_Q 0.03352599101886833
year 0.04388716678747073
*  region 0.26833348129916246


In [5]:
X_train_sel = X_train[selected_features]
X_test_sel=X_test[selected_features]
import statsmodels.api as sm
from sklearn.metrics import explained_variance_score
model_2 = sm.OLS(y_train, X_train_sel)
res_2 = model_2.fit()
print(res_2.summary())

predictions = res_2.predict(X_test_sel)
print(explained_variance_score(predictions,y_test))


# Calculate R-squared
residuals = y_test - predictions
RMSE = np.sqrt(np.mean(residuals**2))
y_test_mean = np.mean(y_test)
tss =  np.sum((y_test - y_test_mean)**2 ) # total sum of square
rss =  np.sum(residuals**2) # sum of residuals
rsq  =  1 - (rss/tss)
print('Model2 features', rsq)


                            OLS Regression Results                            
Dep. Variable:           AveragePrice   R-squared:                       0.388
Model:                            OLS   Adj. R-squared:                  0.387
Method:                 Least Squares   F-statistic:                     865.8
Date:                 Mo, 24 Jun 2019   Prob (F-statistic):               0.00
Time:                        22:36:44   Log-Likelihood:                -3055.8
No. Observations:               10937   AIC:                             6130.
Df Residuals:                   10928   BIC:                             6195.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Total Volume -3.103e-05   7.47e-05     -0.415   