In [425]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error
import random

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [422]:
# Things to consider - letting users select which columns they want to replace empties in (or all)

In [426]:
# Loading the df (the encoding thing might be an issue...let's play around with that) 
original_df = pd.read_csv("Restaurant_With_Empties.csv", encoding = "ISO-8859-1")

In [427]:
original_df.head(15)

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0,7/17/99,ÛÁstanbul,Big Cities,IL,4,5.0,4.0,4.0,2,...,3.0,5,3,4,5,5,4,3,4,5653753.0
1,1,2/14/08,Ankara,Big Cities,FC,4,5.0,,4.0,1,...,3.0,0,0,0,0,0,0,0,0,6923131.0
2,2,3/9/13,DiyarbakÛ±r,Other,IL,2,4.0,2.0,5.0,2,...,3.0,0,0,0,0,0,0,0,0,2055379.0
3,3,2/2/12,Tokat,,IL,6,4.5,6.0,6.0,4,...,7.5,25,12,10,6,18,12,12,6,2675511.0
4,4,5/9/09,Gaziantep,Other,IL,3,4.0,3.0,4.0,2,...,3.0,5,1,3,2,3,4,3,3,4316715.0
5,5,2/12/10,Ankara,Big Cities,FC,6,6.0,4.5,7.5,8,...,5.0,0,0,0,0,0,0,0,0,
6,6,10/11/10,ÛÁstanbul,Big Cities,IL,2,3.0,4.0,4.0,1,...,3.0,4,5,2,2,3,5,4,4,5166635.0
7,7,6/21/11,ÛÁstanbul,Big Cities,IL,4,5.0,4.0,5.0,2,...,2.0,0,0,0,0,0,0,0,0,4491607.0
8,8,8/28/10,Afyonkarahisar,Other,IL,1,1.0,4.0,4.0,1,...,3.0,4,5,5,3,4,5,4,5,4952497.0
9,9,11/16/11,Edirne,,IL,6,4.5,6.0,7.5,6,...,2.5,0,0,0,0,0,0,0,0,5444227.0


In [428]:
# Looking to see the number of empties in each column (there should be 137 in each column if there were no empties) 
original_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137 entries, 0 to 136
Data columns (total 43 columns):
Id            137 non-null int64
Open Date     137 non-null object
City          137 non-null object
City Group    126 non-null object
Type          137 non-null object
P1            137 non-null int64
P2            137 non-null float64
P3            126 non-null float64
P4            137 non-null float64
P5            137 non-null int64
P6            137 non-null int64
P7            137 non-null int64
P8            137 non-null int64
P9            128 non-null float64
P10           137 non-null int64
P11           137 non-null int64
P12           97 non-null float64
P13           137 non-null float64
P14           137 non-null int64
P15           137 non-null int64
P16           137 non-null int64
P17           137 non-null int64
P18           137 non-null int64
P19           137 non-null int64
P20           137 non-null int64
P21           137 non-null int64
P22           137 non-

In [429]:
# Creating a list of any column with empty values
cols_with_empties = original_df.columns[original_df.isnull().any()].tolist()
cols_with_empties

['City Group', 'P3', 'P9', 'P12', 'revenue']

In [430]:
# Inserting temporary row for 0 (regression) or 1 (categorical)

In [431]:
original_df.head(15)

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0,7/17/99,ÛÁstanbul,Big Cities,IL,4,5.0,4.0,4.0,2,...,3.0,5,3,4,5,5,4,3,4,5653753.0
1,1,2/14/08,Ankara,Big Cities,FC,4,5.0,,4.0,1,...,3.0,0,0,0,0,0,0,0,0,6923131.0
2,2,3/9/13,DiyarbakÛ±r,Other,IL,2,4.0,2.0,5.0,2,...,3.0,0,0,0,0,0,0,0,0,2055379.0
3,3,2/2/12,Tokat,,IL,6,4.5,6.0,6.0,4,...,7.5,25,12,10,6,18,12,12,6,2675511.0
4,4,5/9/09,Gaziantep,Other,IL,3,4.0,3.0,4.0,2,...,3.0,5,1,3,2,3,4,3,3,4316715.0
5,5,2/12/10,Ankara,Big Cities,FC,6,6.0,4.5,7.5,8,...,5.0,0,0,0,0,0,0,0,0,
6,6,10/11/10,ÛÁstanbul,Big Cities,IL,2,3.0,4.0,4.0,1,...,3.0,4,5,2,2,3,5,4,4,5166635.0
7,7,6/21/11,ÛÁstanbul,Big Cities,IL,4,5.0,4.0,5.0,2,...,2.0,0,0,0,0,0,0,0,0,4491607.0
8,8,8/28/10,Afyonkarahisar,Other,IL,1,1.0,4.0,4.0,1,...,3.0,4,5,5,3,4,5,4,5,4952497.0
9,9,11/16/11,Edirne,,IL,6,4.5,6.0,7.5,6,...,2.5,0,0,0,0,0,0,0,0,5444227.0


In [433]:
# Finding number of columns (will be used in next portion)
number_of_columns = len(list(original_df))
number_of_columns

43

In [434]:
# Create one empty row up top of 0's that we will eventually fill in 
original_df.loc[-1] = np.zeros(number_of_columns)
original_df.index = original_df.index + 1  # shifting index
original_df = original_df.sort_index()  # sorting by index

In [435]:
# Making sure this worked
original_df.head(10)

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,7/17/99,ÛÁstanbul,Big Cities,IL,4.0,5.0,4.0,4.0,2.0,...,3.0,5.0,3.0,4.0,5.0,5.0,4.0,3.0,4.0,5653753.0
2,1.0,2/14/08,Ankara,Big Cities,FC,4.0,5.0,,4.0,1.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6923131.0
3,2.0,3/9/13,DiyarbakÛ±r,Other,IL,2.0,4.0,2.0,5.0,2.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2055379.0
4,3.0,2/2/12,Tokat,,IL,6.0,4.5,6.0,6.0,4.0,...,7.5,25.0,12.0,10.0,6.0,18.0,12.0,12.0,6.0,2675511.0
5,4.0,5/9/09,Gaziantep,Other,IL,3.0,4.0,3.0,4.0,2.0,...,3.0,5.0,1.0,3.0,2.0,3.0,4.0,3.0,3.0,4316715.0
6,5.0,2/12/10,Ankara,Big Cities,FC,6.0,6.0,4.5,7.5,8.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
7,6.0,10/11/10,ÛÁstanbul,Big Cities,IL,2.0,3.0,4.0,4.0,1.0,...,3.0,4.0,5.0,2.0,2.0,3.0,5.0,4.0,4.0,5166635.0
8,7.0,6/21/11,ÛÁstanbul,Big Cities,IL,4.0,5.0,4.0,5.0,2.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4491607.0
9,8.0,8/28/10,Afyonkarahisar,Other,IL,1.0,1.0,4.0,4.0,1.0,...,3.0,4.0,5.0,5.0,3.0,4.0,5.0,4.0,5.0,4952497.0


In [436]:
# Acually inserting values into first row, with 0=regression and 1=categorical. For now, it's based solely on data 
# type, which will need to be changed

In [437]:
# Creating a dictionary that has the indeces all nulls for each column
null_indeces = {}
for col in cols_with_empties:
    null_indeces.update({col: original_df[col].index[original_df[col].apply(pd.isnull)]})

In [438]:
null_indeces

{'City Group': Int64Index([4, 10, 16, 26, 32, 38, 41, 76, 90, 103, 124], dtype='int64'),
 'P12': Int64Index([  2,   6,   9,  11,  13,  17,  20,  26,  29,  30,  33,  37,  39,
              42,  53,  59,  61,  69,  71,  78,  82,  84,  86,  90,  93,  95,
              98, 102, 104, 107, 110, 111, 116, 120, 124, 127, 128, 130, 133,
             136],
            dtype='int64'),
 'P3': Int64Index([2, 18, 24, 40, 48, 49, 64, 81, 98, 108, 122], dtype='int64'),
 'P9': Int64Index([4, 7, 20, 35, 40, 52, 90, 102, 125], dtype='int64'),
 'revenue': Int64Index([  6,  15,  19,  26,  31,  34,  40,  42,  47,  52,  56,  67,  70,
              77,  83,  84,  90,  92,  97,  98, 100, 104, 108, 113, 117, 120,
             122, 126, 129, 132, 135],
            dtype='int64')}

In [439]:
null_indeces['P12']

Int64Index([  2,   6,   9,  11,  13,  17,  20,  26,  29,  30,  33,  37,  39,
             42,  53,  59,  61,  69,  71,  78,  82,  84,  86,  90,  93,  95,
             98, 102, 104, 107, 110, 111, 116, 120, 124, 127, 128, 130, 133,
            136],
           dtype='int64')

In [440]:
# You can delete this later, but here you can see I'm checking out the data type of values from different columns 
# in the third row. O="object" and "float64" = float (example below)
original_df["City Group"].iloc[2:3].dtype

dtype('O')

In [441]:
original_df["P1"].iloc[2:3].dtype

dtype('float64')

In [442]:
# Loop to check data type of the second row of each column's data type is O (object, aka string). If so, the first 
# valuein that row is replaced with a 1, indicating that it's categorical and not a regression problem
for (counter, column) in enumerate(original_df):
    if original_df[column].iloc[2:3].dtype == 'O':
        original_df[column].iloc[0] = 1
        
# *Maybe we should add other elifs like if the column name contains "date" or "zip" we can assume it's categorical? 

In [443]:
# Checking to make sure it worked, which it did
original_df.head()

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0.0,1,1,1,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,7/17/99,ÛÁstanbul,Big Cities,IL,4.0,5.0,4.0,4.0,2.0,...,3.0,5.0,3.0,4.0,5.0,5.0,4.0,3.0,4.0,5653753.0
2,1.0,2/14/08,Ankara,Big Cities,FC,4.0,5.0,,4.0,1.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6923131.0
3,2.0,3/9/13,DiyarbakÛ±r,Other,IL,2.0,4.0,2.0,5.0,2.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2055379.0
4,3.0,2/2/12,Tokat,,IL,6.0,4.5,6.0,6.0,4.0,...,7.5,25.0,12.0,10.0,6.0,18.0,12.0,12.0,6.0,2675511.0


In [444]:
# This converts all columns with "object" variables (AKA string) into numbers, and creates a dictionary  
row_one = original_df.iloc[0]
char_cols = original_df.dtypes.pipe(lambda x: x[x == 'object']).index
label_mapping = {}

for c in char_cols:
    original_df[c], label_mapping[c] = pd.factorize(original_df[c])

# This part is important because it brings the first row back to the 0/1's depending on whether it's categorical    
original_df.iloc[0] = row_one

In [445]:
char_cols

Index(['Open Date', 'City', 'City Group', 'Type'], dtype='object')

In [446]:
original_df.head()

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0.0,1,1,1,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1,1,1,1,4.0,5.0,4.0,4.0,2.0,...,3.0,5.0,3.0,4.0,5.0,5.0,4.0,3.0,4.0,5653753.0
2,1.0,2,2,1,2,4.0,5.0,,4.0,1.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6923131.0
3,2.0,3,3,2,1,2.0,4.0,2.0,5.0,2.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2055379.0
4,3.0,4,4,-1,1,6.0,4.5,6.0,6.0,4.0,...,7.5,25.0,12.0,10.0,6.0,18.0,12.0,12.0,6.0,2675511.0


In [447]:
# Nulls in categorical variables were still converted to factors, so here I am setting them back to nulls based on 
# their indeces in each column 
for col in cols_with_empties:
    for item in null_indeces[col]:
        original_df[col].iloc[item] = np.nan

In [449]:
original_df.head()

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0.0,1,1,1.0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1,1,1.0,1,4.0,5.0,4.0,4.0,2.0,...,3.0,5.0,3.0,4.0,5.0,5.0,4.0,3.0,4.0,5653753.0
2,1.0,2,2,1.0,2,4.0,5.0,,4.0,1.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6923131.0
3,2.0,3,3,2.0,1,2.0,4.0,2.0,5.0,2.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2055379.0
4,3.0,4,4,,1,6.0,4.5,6.0,6.0,4.0,...,7.5,25.0,12.0,10.0,6.0,18.0,12.0,12.0,6.0,2675511.0


In [450]:
# Here you can see the order it assigned the ints to string. In the City column, Istanbul=0, Ankara=1, etc. We 
# wouldn't actually print this out 
label_mapping

{'City': Index([               1,      'ÛÁstanbul',         'Ankara',    'DiyarbakÛ±r',
                 'Tokat',      'Gaziantep', 'Afyonkarahisar',         'Edirne',
               'Kocaeli',          'Bursa',         'ÛÁzmir',        'Sakarya',
              'ElazÛ±Ûô',        'Kayseri',     'Eskiôehir',    '_anlÛ±urfa',
                'Samsun',          'Adana',        'Antalya',      'Kastamonu',
                 'Uôak',         'MuÛôla',    'KÛ±rklareli',          'Konya',
              'Karabí_k',      'TekirdaÛô',        'Denizli',     'BalÛ±kesir',
                'AydÛ±n',         'Amasya',       'Kí_tahya',           'Bolu',
               'Trabzon',        'Isparta',       'Osmaniye'],
       dtype='object'),
 'City Group': Index([1, 'Big Cities', 'Other'], dtype='object'),
 'Open Date': Index([         1,  '7/17/99',  '2/14/08',   '3/9/13',   '2/2/12',   '5/9/09',
         '2/12/10', '10/11/10',  '6/21/11',  '8/28/10',
        ...
          '9/7/07', '10/14/11',   '2/8

In [451]:
# If you want to access the mapping for one specific column you can do it this way
label_mapping["City Group"]

Index([1, 'Big Cities', 'Other'], dtype='object')

In [452]:
# Checking out df to make sure string values were recoded into numbers
original_df.head(10)

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0.0,1,1,1.0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1,1,1.0,1,4.0,5.0,4.0,4.0,2.0,...,3.0,5.0,3.0,4.0,5.0,5.0,4.0,3.0,4.0,5653753.0
2,1.0,2,2,1.0,2,4.0,5.0,,4.0,1.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6923131.0
3,2.0,3,3,2.0,1,2.0,4.0,2.0,5.0,2.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2055379.0
4,3.0,4,4,,1,6.0,4.5,6.0,6.0,4.0,...,7.5,25.0,12.0,10.0,6.0,18.0,12.0,12.0,6.0,2675511.0
5,4.0,5,5,2.0,1,3.0,4.0,3.0,4.0,2.0,...,3.0,5.0,1.0,3.0,2.0,3.0,4.0,3.0,3.0,4316715.0
6,5.0,6,2,1.0,2,6.0,6.0,4.5,7.5,8.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
7,6.0,7,1,1.0,1,2.0,3.0,4.0,4.0,1.0,...,3.0,4.0,5.0,2.0,2.0,3.0,5.0,4.0,4.0,5166635.0
8,7.0,8,1,1.0,1,4.0,5.0,4.0,5.0,2.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4491607.0
9,8.0,9,6,2.0,1,1.0,1.0,4.0,4.0,1.0,...,3.0,4.0,5.0,5.0,3.0,4.0,5.0,4.0,5.0,4952497.0


In [453]:
# Creating a copy. We'll be adding to full_df as we go through the columns, but NOT pulling from it in training 
# any models. Always pull from original_df for that
full_df = original_df.copy()

In [454]:
for col in cols_with_empties:
    
# If it's categorical
    if original_df[col].iloc[0] == 1:

# Splitting into rows where they do have that value and rows where they don't
        df_complete = original_df.dropna(subset=[col])
        df_empty = original_df.loc[original_df[col].isnull()]
        
# Splitting into features/target for training set
        features = df_complete.drop(col, axis=1)
    
        # Replacing empties in other columns with the column median
        features = features.fillna(features.median())
        
        target = df_complete[col]
        features_empty = df_empty.drop(col, axis=1)

# Training random forest
        randFor = RandomForestClassifier()
        randFor.fit(features, target)

# Set of  predictions for the rows with empty values 
        features_empty = features_empty.fillna(features.median())
        y_pred_randFor = randFor.predict(features_empty)
    

# Replacing the empty values in each column with the y_pred_randFor predictions:
    # Creates a list of all the indeces where that column is null
        index = full_df[col].index[full_df[col].apply(np.isnan)]
        df_index = full_df.index.values.tolist()
        empty_indices = [df_index.index(i) for i in index]

    # Creating a list of answers from that column
        full_df_column = np.array(full_df[col])
        full_df_column_list = list(full_df_column)

    # Creating a list of answers from that column
        for (counter, item) in enumerate(empty_indices):
            full_df_column_list[empty_indices[counter]] = y_pred_randFor[counter]

    # Replace the column of actuals + nulls with an array of actuals + predictions  
        full_df[col] = np.array(full_df_column_list)
        
        
# Same process for regression
    if original_df[col].iloc[0] == 0:

# Splitting into rows where they do have that value and rows where they don't
        df_complete = original_df.dropna(subset=[col])
        df_empty = original_df.loc[original_df[col].isnull()]
        
# Splitting into features/target for training set
        features = df_complete.drop(col, axis=1)
    
        # Replacing empties in other columns with the column median
        features = features.fillna(features.median())
        
        target = df_complete[col]
        features_empty = df_empty.drop(col, axis=1)

# Training random forest
        ridgereg = Ridge(normalize=True)
        ridgereg.fit(features, target)

# Set of  predictions for the rows with empty values 
        features_empty = features_empty.fillna(features.median())
        y_pred_ridgereg = ridgereg.predict(features_empty)
    

# Replacing the empty values in each column with the y_pred_ridgereg predictions:
    # Creates a list of all the indeces where that column is null
        index = full_df[col].index[full_df[col].apply(np.isnan)]
        df_index = full_df.index.values.tolist()
        empty_indices = [df_index.index(i) for i in index]

    # Creating an initial list of answers from that column
        full_df_column = np.array(full_df[col])
        full_df_column_list = list(full_df_column)

    # Updating answers with ridge predictions
        for (counter, item) in enumerate(empty_indices):
            full_df_column_list[empty_indices[counter]] = y_pred_ridgereg[counter]

    # Replace the column of actuals + nulls with an array of actuals + predictions  
        full_df[col] = np.array(full_df_column_list)

In [455]:
full_df.head(10)

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0.0,1,1,1.0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1,1,1.0,1,4.0,5.0,4.0,4.0,2.0,...,3.0,5.0,3.0,4.0,5.0,5.0,4.0,3.0,4.0,5653753.0
2,1.0,2,2,1.0,2,4.0,5.0,4.049651,4.0,1.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6923131.0
3,2.0,3,3,2.0,1,2.0,4.0,2.0,5.0,2.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2055379.0
4,3.0,4,4,1.0,1,6.0,4.5,6.0,6.0,4.0,...,7.5,25.0,12.0,10.0,6.0,18.0,12.0,12.0,6.0,2675511.0
5,4.0,5,5,2.0,1,3.0,4.0,3.0,4.0,2.0,...,3.0,5.0,1.0,3.0,2.0,3.0,4.0,3.0,3.0,4316715.0
6,5.0,6,2,1.0,2,6.0,6.0,4.5,7.5,8.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5538004.0
7,6.0,7,1,1.0,1,2.0,3.0,4.0,4.0,1.0,...,3.0,4.0,5.0,2.0,2.0,3.0,5.0,4.0,4.0,5166635.0
8,7.0,8,1,1.0,1,4.0,5.0,4.0,5.0,2.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4491607.0
9,8.0,9,6,2.0,1,1.0,1.0,4.0,4.0,1.0,...,3.0,4.0,5.0,5.0,3.0,4.0,5.0,4.0,5.0,4952497.0


In [456]:
cols_with_empties

['City Group', 'P3', 'P9', 'P12', 'revenue']

In [457]:
label_mapping['City Group'][1]

'Big Cities'

In [458]:
full_column_list = list(full_df)

In [459]:
for col in full_column_list:
    if full_df[col].iloc[0] == 1:
        full_df[col] = full_df[col].astype(int)
        for counter, item in enumerate(full_df[col]):
            full_df[col].iloc[counter] = label_mapping[col][item] 

In [460]:
full_df.head(10)

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0.0,7/17/99,ÛÁstanbul,Big Cities,IL,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,7/17/99,ÛÁstanbul,Big Cities,IL,4.0,5.0,4.0,4.0,2.0,...,3.0,5.0,3.0,4.0,5.0,5.0,4.0,3.0,4.0,5653753.0
2,1.0,2/14/08,Ankara,Big Cities,FC,4.0,5.0,4.049651,4.0,1.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6923131.0
3,2.0,3/9/13,DiyarbakÛ±r,Other,IL,2.0,4.0,2.0,5.0,2.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2055379.0
4,3.0,2/2/12,Tokat,Big Cities,IL,6.0,4.5,6.0,6.0,4.0,...,7.5,25.0,12.0,10.0,6.0,18.0,12.0,12.0,6.0,2675511.0
5,4.0,5/9/09,Gaziantep,Other,IL,3.0,4.0,3.0,4.0,2.0,...,3.0,5.0,1.0,3.0,2.0,3.0,4.0,3.0,3.0,4316715.0
6,5.0,2/12/10,Ankara,Big Cities,FC,6.0,6.0,4.5,7.5,8.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5538004.0
7,6.0,10/11/10,ÛÁstanbul,Big Cities,IL,2.0,3.0,4.0,4.0,1.0,...,3.0,4.0,5.0,2.0,2.0,3.0,5.0,4.0,4.0,5166635.0
8,7.0,6/21/11,ÛÁstanbul,Big Cities,IL,4.0,5.0,4.0,5.0,2.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4491607.0
9,8.0,8/28/10,Afyonkarahisar,Other,IL,1.0,1.0,4.0,4.0,1.0,...,3.0,4.0,5.0,5.0,3.0,4.0,5.0,4.0,5.0,4952497.0


In [461]:
full_df.shape

(138, 43)

In [462]:
# Getting rid of the top row (which was just 0=regression and 1=categorical)
complete_df = full_df.iloc[1:]

In [465]:
complete_df.head(50)

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
1,0.0,7/17/99,ÛÁstanbul,Big Cities,IL,4.0,5.0,4.0,4.0,2.0,...,3.0,5.0,3.0,4.0,5.0,5.0,4.0,3.0,4.0,5653753.0
2,1.0,2/14/08,Ankara,Big Cities,FC,4.0,5.0,4.049651,4.0,1.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6923131.0
3,2.0,3/9/13,DiyarbakÛ±r,Other,IL,2.0,4.0,2.0,5.0,2.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2055379.0
4,3.0,2/2/12,Tokat,Big Cities,IL,6.0,4.5,6.0,6.0,4.0,...,7.5,25.0,12.0,10.0,6.0,18.0,12.0,12.0,6.0,2675511.0
5,4.0,5/9/09,Gaziantep,Other,IL,3.0,4.0,3.0,4.0,2.0,...,3.0,5.0,1.0,3.0,2.0,3.0,4.0,3.0,3.0,4316715.0
6,5.0,2/12/10,Ankara,Big Cities,FC,6.0,6.0,4.5,7.5,8.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5538004.0
7,6.0,10/11/10,ÛÁstanbul,Big Cities,IL,2.0,3.0,4.0,4.0,1.0,...,3.0,4.0,5.0,2.0,2.0,3.0,5.0,4.0,4.0,5166635.0
8,7.0,6/21/11,ÛÁstanbul,Big Cities,IL,4.0,5.0,4.0,5.0,2.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4491607.0
9,8.0,8/28/10,Afyonkarahisar,Other,IL,1.0,1.0,4.0,4.0,1.0,...,3.0,4.0,5.0,5.0,3.0,4.0,5.0,4.0,5.0,4952497.0
10,9.0,11/16/11,Edirne,Big Cities,IL,6.0,4.5,6.0,7.5,6.0,...,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5444227.0


In [464]:
complete_df.shape

(137, 43)

In [468]:
# Functionality to export final dataframe as csv