In [None]:
#Necessary Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels import api as sm

#Necessary Methods
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split,KFold
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

# Model Imports
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet,Lars,LassoLars,OrthogonalMatchingPursuit,BayesianRidge,SGDRegressor,PassiveAggressiveRegressor
from sklearn.svm import SVR,NuSVR,LinearSVR
from sklearn.neighbors import KNeighborsRegressor,RadiusNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor


In [None]:
!pip install pingouin
from pingouin import kruskal

In [None]:
# Read Train.csv into the working frame and display it.
df = pd.read_csv(filepath_or_buffer="../input/bigmart-sales-data/Train.csv")
df

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Distinct outlet identifiers present in the frame.
df['Outlet_Identifier'].unique()

In [None]:
# For every outlet, list the distinct values taken by each outlet-level column.
gb = df.groupby('Outlet_Identifier')
for col in (
    'Outlet_Establishment_Year',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
):
    print(f"{col} : {gb[col].unique()}")

In [None]:
# Histogram of Item_Weight.
df['Item_Weight'].plot.hist()

In [None]:
# Distinct item identifiers.
df['Item_Identifier'].unique()

In [None]:
# Distinct raw labels in Item_Fat_Content (inspected before normalising them).
df['Item_Fat_Content'].unique()

In [None]:
# Distinct item types.
df['Item_Type'].unique()

In [None]:
# Collapse the alternative spellings of the fat-content labels.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on an attribute-accessed column is a chained in-place modification, which
# pandas warns about and which leaves `df` unchanged under Copy-on-Write.
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({
    'low fat': 'Low Fat',
    'LF': 'Low Fat',
    'reg': 'Regular',
})
df['Item_Fat_Content'].unique()

In [None]:
# Kruskal-Wallis test of Item_Weight against each candidate grouping column.
for between in ('Item_Type', 'Outlet_Identifier', 'Item_Fat_Content'):
    print(kruskal(data=df, dv='Item_Weight', between=between))

In [None]:
# Remove the outlet-level descriptors and the item identifier.
df = df.drop(columns=[
    'Outlet_Establishment_Year',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Item_Identifier',
])

# Fill missing numeric values with the mean of the same column within the
# row's Item_Type. Only numeric columns are averaged: taking the mean of a
# frame that still holds string columns raises in recent pandas, and
# groupby.apply over the grouping column itself is deprecated. transform()
# returns a frame aligned to df's index, so row order is unchanged.
numericColumns = df.select_dtypes(include='number').columns.tolist()
df[numericColumns] = df[numericColumns].fillna(
    df.groupby('Item_Type')[numericColumns].transform('mean')
)
df

In [None]:
# Box plot of the numeric columns.
df.plot.box()

In [None]:
# One histogram per numeric column, arranged on a 2x2 grid.
df.hist(layout=(2, 2), figsize=(20, 10))

In [None]:
# Correlation heatmap of the numeric columns.
# numeric_only=True is required because df still contains string columns and
# recent pandas no longer drops them silently when computing correlations.
sns.heatmap(df.corr(numeric_only=True), annot=True)

In [None]:
# Target column(s); kept as a list so that y is a single-column DataFrame.
outputFeature = ['Item_Outlet_Sales']
y = df.loc[:, outputFeature]

# Every remaining column becomes a predictor. Index.difference returns the
# column names sorted; categorical columns are one-hot encoded with the first
# level of each dropped.
X = pd.get_dummies(
    data=df.loc[:, df.columns.difference(outputFeature)],
    drop_first=True,
)

In [None]:
# Candidate regressors compared by BestRegressor.
# Estimators whose fitting procedure is randomised get a fixed random_state
# (the same seed the KFold splitter uses) so the ranking is reproducible.
models = [
    # Linear models
    LinearRegression(n_jobs=-1),
    Ridge(),
    Lasso(),
    ElasticNet(),
    Lars(),
    LassoLars(),
    OrthogonalMatchingPursuit(),
    BayesianRidge(),
    SGDRegressor(random_state=0),
    PassiveAggressiveRegressor(random_state=0),
    # Support-vector machines
    SVR(),
    NuSVR(),
    # Instance-based and kernel methods
    KNeighborsRegressor(n_jobs=-1),
    GaussianProcessRegressor(),
    # Trees and ensembles
    DecisionTreeRegressor(random_state=0),
    RandomForestRegressor(n_jobs=-1, random_state=0),
    AdaBoostRegressor(random_state=0),
    GradientBoostingRegressor(random_state=0),
]


def BestRegressor(models):
    """Cross-validate each estimator and report its mean train/test RMSE.

    Reads the module-level feature frame ``X`` and target frame ``y``. For
    every estimator a 4-fold shuffled split (seed 0) is run; inside each fold
    the target is power-transformed using parameters fitted on the training
    part only, and the features are power-transformed inside a pipeline ahead
    of the estimator. Errors are therefore measured on the transformed-target
    scale.

    Parameters
    ----------
    models : iterable of estimators
        Unfitted scikit-learn style regressors.

    Returns
    -------
    list of [str, float, float]
        One ``[class name, mean test RMSE, mean train RMSE]`` entry per
        estimator, in input order. Each entry is also printed as soon as it
        is available.
    """
    kf = KFold(n_splits=4, random_state=0, shuffle=True)
    results = []
    for model in models:
        testMetricResults, trainMetricResults = [], []
        for train_index, test_index in kf.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Fit the target transform on the training fold only, then apply
            # the same fitted transform to the held-out fold.
            pt = PowerTransformer()
            y_train = pt.fit_transform(y_train).ravel()
            y_test = pt.transform(y_test).ravel()

            pipeline = make_pipeline(PowerTransformer(), model)
            pipeline.fit(X_train, y_train)

            # RMSE is computed as sqrt(MSE); the `squared=False` argument of
            # mean_squared_error is not available in current scikit-learn.
            testMetricResults.append(
                np.sqrt(mean_squared_error(y_test, pipeline.predict(X_test)))
            )
            trainMetricResults.append(
                np.sqrt(mean_squared_error(y_train, pipeline.predict(X_train)))
            )
        res = [type(model).__name__, np.mean(testMetricResults), np.mean(trainMetricResults)]
        results.append(res)
        print(res)
    return results

# Run the comparison over every candidate; one result row per estimator.
estimator = BestRegressor(models=models)

In [None]:
# Tabulate the cross-validation results, lowest test score first.
(
    pd.DataFrame(data=estimator, columns=['Estimator', 'TestScore', 'TrainScore'])
    .sort_values(by='TestScore')
)

In [None]:
# Hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Feature transform: parameters are fitted on the training part and reused on
# the test part. The arrays are wrapped back into frames to keep column names.
ptX = PowerTransformer()
X_train = pd.DataFrame(data=ptX.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(data=ptX.transform(X_test), columns=X_test.columns)

# Target transform, fitted on the training part only.
ptY = PowerTransformer()
y_train = ptY.fit_transform(y_train)
y_test = ptY.transform(y_test)

In [None]:
# Add the intercept column to both design matrices, then fit OLS on the
# training part and print the regression summary.
X_train, X_test = sm.add_constant(X_train), sm.add_constant(X_test)
model = sm.OLS(endog=y_train, exog=X_train).fit()
print(model.summary())

In [None]:
# RMSE of the full OLS model on the transformed-target scale.
# Computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# is not available in current scikit-learn.
print(f"Test : {np.sqrt(mean_squared_error(y_test, model.predict(X_test)))}")
print(f"Train : {np.sqrt(mean_squared_error(y_train, model.predict(X_train)))}")

In [None]:
def RemoveUnneccesaryFeatures(XTrain, yTrain):
    """Backward-eliminate regressors whose OLS p-value is above 0.05.

    Repeatedly fits ``sm.OLS(yTrain, XTrain)``, finds the non-``const``
    column with the largest p-value above 0.05, prints it and drops it. The
    loop ends when no such column is left.

    The elimination runs on a copy, so the frame passed in by the caller is
    left untouched; use the returned column index to select the retained
    regressors.

    Parameters
    ----------
    XTrain : pandas.DataFrame
        Design matrix. A column named ``const`` is never dropped.
    yTrain : array-like
        Response passed to ``sm.OLS`` unchanged.

    Returns
    -------
    tuple
        ``(fitted OLS results, Index of retained column names)``.
    """
    XTrain = XTrain.copy()
    while True:
        lmodel = sm.OLS(yTrain, XTrain).fit()
        pv = lmodel.pvalues
        # The intercept is never a candidate for removal.
        pv = pv[pv.index.difference(['const'])]

        insignificant = pv[pv > .05]
        # Explicit stop condition instead of relying on idxmax() raising on an
        # empty selection inside a bare except, which hid every other error.
        if insignificant.empty:
            return lmodel, XTrain.columns

        column = insignificant.idxmax()
        print(f"Column : {column}\tValue : {pv[column]}")
        XTrain = XTrain.drop(columns=[column])

In [None]:
# Reduced OLS model plus the names of the columns it retained.
lModel, columns = RemoveUnneccesaryFeatures(XTrain=X_train, yTrain=y_train)

In [None]:
# Regression summary of the reduced model returned by the elimination step.
lModel.summary()

In [None]:
# RMSE of the reduced OLS model on the transformed-target scale, using only
# the retained columns.
# Computed as sqrt(MSE): the `squared=False` argument of mean_squared_error
# is not available in current scikit-learn.
print(f"Test : {np.sqrt(mean_squared_error(y_test, lModel.predict(X_test[columns])))}")
print(f"Train : {np.sqrt(mean_squared_error(y_train, lModel.predict(X_train[columns])))}")

In [None]:
# Read Test.csv into its own frame and display it.
dfTest = pd.read_csv(filepath_or_buffer="../input/bigmart-sales-data/Test.csv")
dfTest

In [None]:
# Number of missing values in each column of the test frame.
dfTest.isna().sum()

In [None]:
# Distinct outlet identifiers in the test frame.
dfTest['Outlet_Identifier'].unique()

In [None]:
# For every outlet in the test frame, list the distinct values taken by each
# outlet-level column.
gb = dfTest.groupby('Outlet_Identifier')
for col in (
    'Outlet_Establishment_Year',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
):
    print(f"{col} : {gb[col].unique()}")

In [None]:
# Histogram of Item_Weight in the test frame.
dfTest['Item_Weight'].plot.hist()

In [None]:
# Distinct item identifiers in the test frame.
dfTest['Item_Identifier'].unique()

In [None]:
# Distinct raw labels in Item_Fat_Content of the test frame.
dfTest['Item_Fat_Content'].unique()

In [None]:
# Distinct item types in the test frame.
dfTest['Item_Type'].unique()

In [None]:
# Collapse the alternative spellings of the fat-content labels in the test
# frame, using the same mapping as for the training frame.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on an attribute-accessed column is a chained in-place modification, which
# pandas warns about and which leaves `dfTest` unchanged under Copy-on-Write.
dfTest['Item_Fat_Content'] = dfTest['Item_Fat_Content'].replace({
    'low fat': 'Low Fat',
    'LF': 'Low Fat',
    'reg': 'Regular',
})
dfTest['Item_Fat_Content'].unique()

In [None]:
# Remove the same outlet-level descriptors and item identifier that were
# removed from the training frame.
dfTest = dfTest.drop(columns=[
    'Outlet_Establishment_Year',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Item_Identifier',
])

# Fill missing numeric values with the mean of the same column within the
# row's Item_Type. Only numeric columns are averaged: taking the mean of a
# frame that still holds string columns raises in recent pandas, and
# groupby.apply over the grouping column itself is deprecated. transform()
# returns a frame aligned to dfTest's index, so row order is unchanged.
numericColumnsTest = dfTest.select_dtypes(include='number').columns.tolist()
dfTest[numericColumnsTest] = dfTest[numericColumnsTest].fillna(
    dfTest.groupby('Item_Type')[numericColumnsTest].transform('mean')
)
dfTest

In [None]:
XTest = pd.get_dummies(dfTest, drop_first=True)

# Align the test design matrix with the training one before reusing the fitted
# transformer. The training matrix X was built from columns sorted by
# Index.difference, while dfTest keeps its own column order, and the set of
# dummy columns can differ if a category is missing from either file.
# reindex puts the columns in X's order, adds any missing dummy as all-zero
# and discards columns the model never saw.
XTest = XTest.reindex(columns=X.columns, fill_value=0)

XTest = pd.DataFrame(ptX.transform(XTest), columns=XTest.columns, index=XTest.index)

# has_constant='add' always inserts the intercept; the default ('skip') omits
# it when some column happens to be constant, which would make the lookup of
# 'const' below fail.
XTest = sm.add_constant(XTest, has_constant='add')

# Predictions are on the power-transformed target scale.
lModel.predict(XTest[columns])