In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

In [None]:
# Read the three competition CSVs in a fixed order: train, test, sample submission.
train_df, test_df, sample_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jan-2022/train.csv',
        '../input/tabular-playground-series-jan-2022/test.csv',
        '../input/tabular-playground-series-jan-2022/sample_submission.csv',
    )
)
train_df.head()

In [None]:
test_df.head()

In [None]:
train_df.describe()

In [None]:
train_df.isna().sum()

In [None]:
test_df.isna().sum()

In [None]:
train_df.isnull().sum()

In [None]:
train_df.info()

In [None]:
def _with_date_parts(frame):
    """Return `frame` with year/week/day/weekday columns replacing `date`.

    The date column is parsed once. The ISO week comes from
    `Series.dt.isocalendar()`, because `Series.dt.week` was removed in
    pandas 2.0; it is cast to int64 so the column stays a plain integer
    instead of the nullable UInt32 that `isocalendar()` returns.

    If `date` is already gone (the cell was run before), the frame is
    returned unchanged so the cell can be re-run safely.
    """
    if 'date' not in frame.columns:
        return frame
    parsed = pd.to_datetime(frame['date'])
    return frame.assign(
        year=parsed.dt.year,
        week=parsed.dt.isocalendar().week.astype('int64'),
        day=parsed.dt.day,
        weekday=parsed.dt.dayofweek,
    ).drop(columns='date')


train_df = _with_date_parts(train_df)
test_df = _with_date_parts(test_df)
train_df.head()

In [None]:
# Print the frequency of every level of the three categorical columns,
# with a separator line between consecutive reports.
count_reports = [
    ('Country Count\n', 'country'),
    ('Store Count\n', 'store'),
    ('Product Count\n', 'product'),
]
for position, (heading, column) in enumerate(count_reports):
    if position > 0:
        print('-----------------------------------------------')
    print(heading, train_df[column].value_counts())

In [None]:
# Keep the test-set identifiers as a one-column frame for the submission file.
row_id = test_df['row_id'].to_frame(name='row_id')
row_id.head()

In [None]:
train_ohe = pd.get_dummies(train_df, columns = ['country', 'store', 'product'])
test_ohe = pd.get_dummies(test_df, columns = ['country', 'store', 'product'])
train_ohe.head()

In [None]:
train_ohe.drop('row_id', axis=1, inplace=True)
test_ohe.drop('row_id', axis=1, inplace=True)
train_ohe.head()

In [None]:
Xtrain = train_ohe
ytrain = train_df['num_sold']
Xtrain.head()

In [None]:
Xtrain.drop('num_sold', axis=1, inplace=True)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(Xtrain, ytrain, test_size=.3, random_state=48)

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
from sklearn.svm import SVR

In [None]:
# Dimensions of the training split.
nrow, ncol = X_train.shape[0], X_train.shape[1]
print('No of Row: ',nrow)
print('No of Columns: ',ncol)

In [None]:
def scores(i):
    """Fit one regressor with default settings and record its hold-out metrics.

    Parameters
    ----------
    i : estimator class
        Called with no arguments to build the model; the instance must
        provide `fit` and `predict`.

    Reads the notebook globals `X_train`, `y_train`, `X_test`, `y_test` and
    appends one value to each global accumulator:
    `s` (R2), `s1` (adjusted R2), `s2` (100 - MAPE), `s3` (MAPE),
    `s4` (sMAPE), `s5` (MAE), `s6` (MSE), `s7` (RMSE).
    Returns None.
    """
    model = i()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    s.append(r2)

    # Adjusted R2 has to use the size of the sample R2 was measured on,
    # i.e. the hold-out split, not the training split.
    n_eval, n_features = X_test.shape
    dof = n_eval - 1 - n_features
    adj_r2_score = 1 - ((1 - r2) * (n_eval - 1)) / dof if dof > 0 else float('nan')
    s1.append(adj_r2_score)

    residuals = y_test - y_pred
    errors = np.abs(residuals)

    # sMAPE divides by the mean of the absolute magnitudes; without abs() a
    # negative prediction shrinks (or flips the sign of) the denominator.
    err = (np.abs(y_test) + np.abs(y_pred)) / 2
    smape = np.mean((errors / err) * 100)

    # NOTE(review): MAPE divides by the raw target, which presumes num_sold
    # is strictly positive - confirm against the data.
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    s2.append(accuracy)
    s3.append(mape)
    s4.append(smape)

    MAE = errors.mean()
    s5.append(MAE)

    MSE = (residuals ** 2).mean()
    s6.append(MSE)

    RMSE = np.sqrt(MSE)
    s7.append(RMSE)

# Regressor classes to benchmark, in the order used by the results table.
algos = [
    LinearRegression,
    KNeighborsRegressor,
    RandomForestRegressor,
    Lasso,
    ElasticNet,
    DecisionTreeRegressor,
    GradientBoostingRegressor,
    SVR,
]

# One fresh accumulator list per metric; scores() appends to each of them.
s, s1, s2, s3, s4, s5, s6, s7 = ([] for _ in range(8))

for regressor_cls in algos:
    scores(regressor_cls)

In [None]:
# Assemble the per-model metrics into one table, best R2 first.
method_names = ['LinearRegression', 'KNeighborsRegressor', 'RandomForestRegressor', 'Lasso', 'ElasticNet',
                'DecisionTreeRegressor', 'GradientBoostingRegressor', 'SVR']
metric_columns = {
    'r2 Scores': s,
    'Ajd r2 Score': s1,
    'Accuracy': s2,
    'MAPE': s3,
    'SMAPE': s4,
    'MAE': s5,
    'MSE': s6,
    'RMSE': s7,
}
models = pd.DataFrame({
    'Method': method_names,
    # Take exactly the first eight entries of every accumulator, by position.
    **{label: [values[k] for k in range(8)] for label, values in metric_columns.items()},
})
models.sort_values(by='r2 Scores', ascending=False)

In [None]:
# Hold-out evaluation. The scoring model is fitted on the training split only:
# fitting on all of Xtrain and then scoring on X_test (a subset of those rows)
# leaks the evaluation rows into training and inflates every metric.
# random_state makes the forest reproducible between runs.
holdout_model = RandomForestRegressor(random_state=48)
holdout_model.fit(X_train, y_train)
y_pred = holdout_model.predict(X_test)

separator = '--------------------------------------------------------------'

r2Score = r2_score(y_test, y_pred)
# R2 values noted on earlier runs: 0.9730024554109795, 0.973003845604028, 0.9781682423157221
# NOTE(review): presumably measured with the model fitted on all rows, so not
# comparable with the hold-out score printed here - confirm.
print('R2 Score',r2Score)
print(separator)

# Adjusted R2 uses the size of the sample that R2 was measured on (the hold-out split).
n_eval, n_features = X_test.shape
adj_r2_score = 1 - (((1 - r2Score) * (n_eval - 1)) / (n_eval - 1 - n_features))
print('Adjusted r2 Score',adj_r2_score)
print(separator)

errors = abs(y_test - y_pred)
# sMAPE denominator is the mean of the absolute magnitudes.
err = (abs(y_test) + abs(y_pred))/2
smape = np.mean((errors/err)*100)
print('sMAPE',smape)
print(separator)

mape = 100 * np.mean(errors / y_test)
accuracy = 100 - mape
print('Accuracy = {:0.2f}%.'.format(accuracy))
print(separator)
print('MAE', errors.mean())
print(separator)
print('RMSE: ', np.sqrt(((y_test - y_pred)**2).mean()))
print(separator)
print('MSE: ', ((y_test - y_pred)**2).mean())

# Final model used for the submission: refit on every labelled row.
rfr_Model = RandomForestRegressor(random_state=48)
rfr_Model.fit(Xtrain, ytrain)

In [None]:
# Predict the competition test set and pair each prediction with its row_id.
df_test_preds = rfr_Model.predict(test_ohe)
num_sold = pd.DataFrame({'num_sold': df_test_preds})

# Both frames get a fresh 0..n-1 index, so the join lines them up row by row.
df_result = row_id.reset_index(drop=True).join(num_sold.reset_index(drop=True))

df_result.to_csv('sample_submission.csv', index=False)

In [None]:
# Read back the file the submission cell actually writes. No visible cell
# writes './df_result.csv', so reading it raised FileNotFoundError on a clean run.
submit = pd.read_csv('./sample_submission.csv')
submit.head()

In [None]:
# Reload the written submission from disk and preview its first rows.
submit = pd.read_csv(filepath_or_buffer='./sample_submission.csv')
submit.head(n=5)

In [None]:
# Use PolynomialFeatures from sklearn.preprocessing to create two-way interactions for all features
from itertools import combinations
from sklearn.preprocessing import PolynomialFeatures

def add_interactions(df):
    """Return `df` extended with every pairwise (two-way) product of its columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature frame.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by one column per column pair, named
        '<a>_<b>'. Columns that are zero in every row are removed. The row
        index of `df` is preserved.
    """
    # Names follow the output order of PolynomialFeatures: the original
    # columns first, then the pairs in itertools.combinations order.
    pair_names = ['_'.join(map(str, pair)) for pair in combinations(df.columns, 2)]
    colnames = list(df.columns) + pair_names

    # interaction_only=True yields products of distinct columns (no squares);
    # include_bias=False omits the constant column.
    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
    expanded = pd.DataFrame(poly.fit_transform(df), columns=colnames, index=df.index)

    # Keep only columns with at least one non-zero value. The mask is applied
    # positionally so duplicate labels cannot confuse the selection.
    has_signal = (expanded != 0).any(axis=0).to_numpy()
    return expanded.loc[:, has_signal]

In [None]:
X = add_interactions(Xtrain)
# add_interactions drops all-zero columns separately for each frame, so the
# train and test column sets can differ. Align test to the training layout:
# a column missing from test was all zero there, so 0.0 is the exact value;
# a column present only in test is unknown to the model and is discarded.
test_ohe = add_interactions(test_ohe).reindex(columns=X.columns, fill_value=0.0)
X.head()

In [None]:
print('X Shape',X.shape)
print('Xtrain Shape', Xtrain.shape)
print('y Shape', ytrain.shape)
print('test_ohe Shape', test_ohe.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, ytrain, test_size=.25, random_state=48)
nrow, ncol = X_train.shape
print('No of Row: ',nrow)
print('No of Columns: ',ncol)
X_test.shape

In [None]:
ytrain.shape

In [None]:
# Same benchmark as before, now on the interaction-feature split.
algos = [
    LinearRegression, KNeighborsRegressor, RandomForestRegressor, Lasso,
    ElasticNet, DecisionTreeRegressor, GradientBoostingRegressor, SVR,
]

# Reset every metric accumulator before scores() appends to them again.
s, s1, s2, s3 = [], [], [], []
s4, s5, s6, s7 = [], [], [], []

for candidate in algos:
    scores(candidate)

In [None]:
# Results table for the interaction-feature benchmark, best R2 first.
table_columns = [
    ('r2 Scores', s),
    ('Ajd r2 Score', s1),
    ('Accuracy', s2),
    ('MAPE', s3),
    ('SMAPE', s4),
    ('MAE', s5),
    ('MSE', s6),
    ('RMSE', s7),
]
table_data = {
    'Method': ['LinearRegression', 'KNeighborsRegressor', 'RandomForestRegressor', 'Lasso', 'ElasticNet',
               'DecisionTreeRegressor', 'GradientBoostingRegressor', 'SVR'],
}
for column_label, recorded in table_columns:
    # Exactly the first eight recorded values, by position.
    table_data[column_label] = [recorded[k] for k in range(8)]

models = pd.DataFrame(table_data)
models.sort_values(by='r2 Scores', ascending=False)

In [None]:
# Hold-out evaluation on the interaction features. The scoring model is fitted
# on the training split only: fitting on all of X and then scoring on X_test
# (a subset of those rows) leaks the evaluation rows into training.
# random_state makes the forest reproducible between runs.
holdout_model = RandomForestRegressor(random_state=48)
holdout_model.fit(X_train, y_train)
y_pred = holdout_model.predict(X_test)

separator = '--------------------------------------------------------------'

r2Score = r2_score(y_test, y_pred)
# R2 values noted on earlier runs: 0.9730024554109795, 0.973003845604028, 0.9781682423157221
# NOTE(review): presumably measured with the model fitted on all rows, so not
# comparable with the hold-out score printed here - confirm.
print('R2 Score',r2Score)
print(separator)

# Adjusted R2 uses the size of the sample that R2 was measured on (the hold-out split).
n_eval, n_features = X_test.shape
adj_r2_score = 1 - (((1 - r2Score) * (n_eval - 1)) / (n_eval - 1 - n_features))
print('Adjusted r2 Score',adj_r2_score)
print(separator)

errors = abs(y_test - y_pred)
err = (abs(y_test) + abs(y_pred))/2
smape = np.mean((errors/err)*100)
print('sMAPE',smape)
print(separator)

mape = 100 * np.mean(errors / y_test)
accuracy = 100 - mape
print('Accuracy = {:0.2f}%.'.format(accuracy))
print(separator)
print('MAE', errors.mean())
print(separator)
print('RMSE: ', np.sqrt(((y_test - y_pred)**2).mean()))
print(separator)
print('MSE: ', ((y_test - y_pred)**2).mean())

# Final model used for the submission: refit on every labelled row.
rfr_Model = RandomForestRegressor(random_state=48)
rfr_Model.fit(X, ytrain)

In [None]:
# Score the interaction-expanded test set with the refitted forest.
df_test_preds = rfr_Model.predict(test_ohe)
num_sold = pd.DataFrame(df_test_preds, columns=['num_sold'])

# Give both pieces a fresh 0..n-1 index, then place them side by side.
submission_parts = [part.reset_index(drop=True) for part in (row_id, num_sold)]
df_result = pd.concat(submission_parts, axis=1)

df_result.to_csv('sample_submission.csv', index=False)