In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

In [2]:
# Read both data sets with identical parser settings; the literal string
# 'unknown' is treated as a missing value.
CSV_OPTIONS = {"sep": ",", "na_values": "unknown"}
train = pd.read_csv("train.csv", **CSV_OPTIONS)
test = pd.read_csv("test.csv", **CSV_OPTIONS)

# Report (rows, columns) for train, then test.
for frame in (train, test):
    print(frame.shape)



(137, 43)
(100000, 42)


In [3]:
# Column names of the training frame, followed by summary statistics for
# every column (numeric and non-numeric alike).
print(train.columns.tolist())
print(train.describe(include='all'))

['Id', 'Open Date', 'City', 'City Group', 'Type', 'P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15', 'P16', 'P17', 'P18', 'P19', 'P20', 'P21', 'P22', 'P23', 'P24', 'P25', 'P26', 'P27', 'P28', 'P29', 'P30', 'P31', 'P32', 'P33', 'P34', 'P35', 'P36', 'P37', 'revenue']
                Id   Open Date      City  City Group Type          P1  \
count   137.000000         137       137         137  137  137.000000   
unique         NaN         134        34           2    3         NaN   
top            NaN  01/07/2000  İstanbul  Big Cities   FC         NaN   
freq           NaN           2        50          78   76         NaN   
mean     68.000000         NaN       NaN         NaN  NaN    4.014599   
std      39.692569         NaN       NaN         NaN  NaN    2.910391   
min       0.000000         NaN       NaN         NaN  NaN    1.000000   
25%      34.000000         NaN       NaN         NaN  NaN    2.000000   
50%      68.000000         NaN  

In [4]:
# Column names of the test frame, followed by summary statistics for
# every column (numeric and non-numeric alike).
print(test.columns.tolist())
print(test.describe(include='all'))

['Id', 'Open Date', 'City', 'City Group', 'Type', 'P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15', 'P16', 'P17', 'P18', 'P19', 'P20', 'P21', 'P22', 'P23', 'P24', 'P25', 'P26', 'P27', 'P28', 'P29', 'P30', 'P31', 'P32', 'P33', 'P34', 'P35', 'P36', 'P37']
                   Id   Open Date      City City Group    Type             P1  \
count   100000.000000      100000    100000     100000  100000  100000.000000   
unique            NaN         310        57          2       4            NaN   
top               NaN  07/10/2013  İstanbul      Other      FC            NaN   
freq              NaN         645     34087      50728   57019            NaN   
mean     49999.500000         NaN       NaN        NaN     NaN       4.088030   
std      28867.657797         NaN       NaN        NaN     NaN       2.812963   
min          0.000000         NaN       NaN        NaN     NaN       1.000000   
25%      24999.750000         NaN       NaN        N

In [5]:
# Add a 'year' column to both frames: the calendar year parsed out of the
# 'Open Date' string, i.e. the year in which the restaurant was opened.
for frame in (train, test):
    frame['year'] = pd.DatetimeIndex(frame['Open Date']).year


In [6]:
# Pairwise correlations between the numeric columns, saved for offline review.
# `train` still holds text columns here (Open Date, City, City Group, Type).
# Recent pandas versions raise on non-numeric columns in corr() instead of
# silently skipping them, so the numeric columns are selected explicitly.
corr = train.select_dtypes(include="number").corr()
corr.to_csv("Correlation.csv")

In [7]:
# Show the dtype of every training column (after the 'year' column was added).
train.dtypes

Id              int64
Open Date      object
City           object
City Group     object
Type           object
P1              int64
P2            float64
P3            float64
P4            float64
P5              int64
P6              int64
P7              int64
P8              int64
P9              int64
P10             int64
P11             int64
P12             int64
P13           float64
P14             int64
P15             int64
P16             int64
P17             int64
P18             int64
P19             int64
P20             int64
P21             int64
P22             int64
P23             int64
P24             int64
P25             int64
P26           float64
P27           float64
P28           float64
P29           float64
P30             int64
P31             int64
P32             int64
P33             int64
P34             int64
P35             int64
P36             int64
P37             int64
revenue       float64
year            int64
dtype: object

In [8]:
# Columns stored with the pandas "category" dtype in both frames.
# The P1..P37 columns are deliberately not in this list (they were commented
# out of it), so they keep their numeric dtypes.
categorical_cols = ['City', 'City Group', 'Type']
for frame in (train, test):
    for col in categorical_cols:
        frame[col] = frame[col].astype("category")
    

In [10]:
# Keep the test Ids for the prediction files before the column is dropped.
testId = test.Id

# 'Id' and the raw 'Open Date' string are not used as model features
# ('Open Date' has already been reduced to the 'year' column).
dropped_cols = ["Id", "Open Date"]
train = train.drop(columns=dropped_cols)
test = test.drop(columns=dropped_cols)

# Disabled experiment: dropping a hand-picked subset of the P columns.
#train = train.drop(['P2', "P7", 'P19', 'P20', 'P28', 'P9', 'P10', 'P12', 'P13', 'P15', 'P16', 'P17', 'P18', 'P24', 'P25', 'P26', 'P30', 'P32', 'P33', 'P34', 'P35', 'P36', 'P37', 'P23'], axis =1 ) 
#test  =  test.drop(['P2', "P7", 'P19', 'P20', 'P28', 'P9', 'P10', 'P12', 'P13', 'P15', 'P16', 'P17', 'P18', 'P24', 'P25', 'P26', 'P30', 'P32', 'P33', 'P34', 'P35', 'P36', 'P37', 'P23'], axis =1 ) 

# Correlation matrix of the remaining numeric columns. The category columns
# (City, City Group, Type) are excluded explicitly because recent pandas
# versions raise on non-numeric columns in corr() instead of skipping them.
train.select_dtypes(include="number").corr()

Unnamed: 0,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,...,P30,P31,P32,P33,P34,P35,P36,P37,revenue,year
P1,1.0,0.836671,0.695928,0.677941,0.30282,0.747296,0.853753,0.76082,0.765743,0.795885,...,0.487589,0.479175,0.552593,0.329795,0.570835,0.449182,0.554951,0.277601,0.070217,-0.083658
P2,0.836671,1.0,0.473551,0.540322,0.203003,0.673106,0.769842,0.505116,0.481635,0.498135,...,0.342005,0.353477,0.389376,0.234991,0.394319,0.314798,0.387938,0.161162,0.191518,-0.16995
P3,0.695928,0.473551,1.0,0.443606,0.168891,0.570486,0.651489,0.689173,0.741967,0.764191,...,0.473089,0.406701,0.463217,0.312269,0.501575,0.428504,0.484942,0.284065,-0.024613,-0.0142
P4,0.677941,0.540322,0.443606,1.0,0.715406,0.595724,0.700482,0.654594,0.675336,0.685416,...,0.387747,0.333626,0.440795,0.18282,0.434245,0.277902,0.411664,0.177916,0.035685,0.019542
P5,0.30282,0.203003,0.168891,0.715406,1.0,0.326524,0.446154,0.447577,0.465662,0.462299,...,0.244032,0.112603,0.215568,0.071079,0.24304,0.173309,0.193652,0.033548,-0.028191,0.116705
P6,0.747296,0.673106,0.570486,0.595724,0.326524,1.0,0.738388,0.720144,0.676323,0.654594,...,0.460618,0.477593,0.519243,0.300611,0.519635,0.413645,0.524552,0.327594,0.139094,-0.18757
P7,0.853753,0.769842,0.651489,0.700482,0.446154,0.738388,1.0,0.751196,0.775003,0.775642,...,0.509213,0.452471,0.531011,0.339178,0.562561,0.457721,0.543563,0.309678,0.051165,0.006063
P8,0.76082,0.505116,0.689173,0.654594,0.447577,0.720144,0.751196,1.0,0.919956,0.88165,...,0.454937,0.420425,0.516617,0.256416,0.524474,0.405709,0.499777,0.308294,-0.084215,0.040025
P9,0.765743,0.481635,0.741967,0.675336,0.465662,0.676323,0.775003,0.919956,1.0,0.961266,...,0.50267,0.467334,0.544749,0.326869,0.582042,0.467916,0.551953,0.330927,-0.050352,0.076214
P10,0.795885,0.498135,0.764191,0.685416,0.462299,0.654594,0.775642,0.88165,0.961266,1.0,...,0.514738,0.462235,0.549238,0.320317,0.601956,0.48301,0.565068,0.313748,-0.07322,0.077591


In [11]:
# Number of missing values in each training column.
train.isna().sum()

City          0
City Group    0
Type          0
P1            0
P2            0
P3            0
P4            0
P5            0
P6            0
P7            0
P8            0
P9            0
P10           0
P11           0
P12           0
P13           0
P14           0
P15           0
P16           0
P17           0
P18           0
P19           0
P20           0
P21           0
P22           0
P23           0
P24           0
P25           0
P26           0
P27           0
P28           0
P29           0
P30           0
P31           0
P32           0
P33           0
P34           0
P35           0
P36           0
P37           0
revenue       0
year          0
dtype: int64

In [12]:
# Number of missing values in each test column.
test.isna().sum()

City          0
City Group    0
Type          0
P1            0
P2            0
P3            0
P4            0
P5            0
P6            0
P7            0
P8            0
P9            0
P10           0
P11           0
P12           0
P13           0
P14           0
P15           0
P16           0
P17           0
P18           0
P19           0
P20           0
P21           0
P22           0
P23           0
P24           0
P25           0
P26           0
P27           0
P28           0
P29           0
P30           0
P31           0
P32           0
P33           0
P34           0
P35           0
P36           0
P37           0
year          0
dtype: int64

In [13]:
# Training features are every column except the target; the target is 'revenue'.
X_train = train.drop(columns='revenue')
y_train = train['revenue']

# The test frame has no 'revenue' column (its column list lacks it), so all of
# its columns are kept as features and there is no y_test to extract.
X_test = test.drop(columns='revenue', errors='ignore')

In [14]:
# Split the feature names by dtype so each group gets its own preprocessing.
# Categorical features are the columns stored with the "category" dtype.
cat_attr = list(X_train.select_dtypes("category").columns)
# Everything else is treated as numeric. Index.difference returns the
# remaining names in sorted (lexicographic) order, not in frame order.
num_attr = list(X_train.columns.difference(cat_attr))
print(cat_attr)
print(num_attr)


['City', 'City Group', 'Type']
['P1', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15', 'P16', 'P17', 'P18', 'P19', 'P2', 'P20', 'P21', 'P22', 'P23', 'P24', 'P25', 'P26', 'P27', 'P28', 'P29', 'P3', 'P30', 'P31', 'P32', 'P33', 'P34', 'P35', 'P36', 'P37', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'year']


In [15]:
# Numeric features: fill missing values with the column median, then
# standardise with StandardScaler.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent level, then
# one-hot encode. No fill_value is passed because SimpleImputer only consults
# it when strategy='constant'. handle_unknown='ignore' matters here: the test
# set contains levels (e.g. more distinct cities) that never occur in train.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_attr),
        ('cat', categorical_transformer, cat_attr),
    ])

# Building Linear Model

In [16]:
# End-to-end linear model: preprocessing followed by a LinearRegression.
# The final step is named 'classifier' even though it holds a regressor.
linear_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LinearRegression()),
]
linearReg = Pipeline(steps=linear_steps)

In [17]:
# Fit the preprocessing steps and the regression on the training data.
linearReg.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [18]:
# Predict revenue for the training rows (in-sample) and for the test rows.
train_pred, test_pred = [linearReg.predict(part) for part in (X_train, X_test)]

In [19]:
# Pair every test Id with its linear-model prediction. The frame is built
# directly from a literal so the builtin `dict` is no longer shadowed.
predictLinear = pd.DataFrame({"Id": testId, "Prediction": test_pred})

# NOTE(review): to_csv also writes the frame's index as an unnamed first
# column; pass index=False if the consumer expects exactly Id,Prediction.
predictLinear.to_csv("LinearData.csv")

# Building PCA model

In [20]:
# PCA needs purely numeric input, so the three categorical columns are removed
# from both feature frames. (PCA itself is imported in the notebook's import
# cell at the top.)
non_numeric_cols = ['City', 'City Group', 'Type']
X_train = X_train.drop(columns=non_numeric_cols)
X_test = X_test.drop(columns=non_numeric_cols)


In [22]:
# Number of principal components to keep (alternative noted by the author:
# n_components='mle').
N_PCA_COMPONENTS = 8
pca = PCA(n_components=N_PCA_COMPONENTS)

# Learn the projection from the training features only, then apply that same
# projection to the test features.
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# Variance captured by each retained component.
pca.explained_variance_

array([208.91749672,  53.08767338,  17.46592127,   9.97588954,
         6.91145616,   5.11620872,   4.82572766,   4.01569449])

In [23]:
# Variance captured by each retained component (same expression as the last line of the previous cell).
pca.explained_variance_

array([208.91749672,  53.08767338,  17.46592127,   9.97588954,
         6.91145616,   5.11620872,   4.82572766,   4.01569449])

In [24]:
# Share of the total variance explained by each retained component, in percent.
100 * pca.explained_variance_ratio_

array([62.79401571, 15.95648162,  5.24970551,  2.99843802,  2.07736591,
        1.53777111,  1.45046166,  1.20699121])

In [25]:
# Number of components the fitted PCA actually kept.
pca.n_components_

8

In [26]:

# Linear regression on the PCA-projected features. The variable is named
# `classifier` although the estimator is a regressor.
classifier = LinearRegression()
classifier.fit(X_train, y_train)

# Predict revenue for the PCA-projected test features.
y_pred = classifier.predict(X_test)

# Pair every test Id with its prediction. The frame is built directly from a
# literal so the builtin `dict` is no longer shadowed.
predictDF = pd.DataFrame({"Id": testId, "Prediction": y_pred})

# NOTE(review): to_csv also writes the frame's index as an unnamed first
# column; pass index=False if the consumer expects exactly Id,Prediction.
predictDF.to_csv("PCA.csv")