In [5]:
# imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
# from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import classification_report
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn import neighbors
from sklearn.svm import SVC
from sklearn import svm
from sklearn import grid_search
import random

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')



<b> Defining the filling empty values function </b>

In [7]:
import sys

def fillEmpty(original_df,colnum,flag):
    # assumed: df is the dataframe to operate on,
    # colnum is the column number with missing values
    # flag = 0/1 is whether it is a classification or regression problem
    
    # copying the original dataframe
    df = original_df.copy()
    
    # testing for valid flag
    if(flag != 0 and flag != 1):
        print('Invalid input flag')
        sys.exit()
    # testing for valid column number
    number_of_columns = len(list(original_df))
    if(colnum < 0 or colnum >= number_of_columns):
        print('Invalid input column number')
        sys.exit()
    # testing for the existence of empty column values
    a = df.iloc[:, [colnum]].isnull()
    idx = []
    for col in a:
        i=0
        for c in a[col]:
            if(c == True):
                idx.append(i)
            i=i+1
    if(len(idx) == 0):
        print('No empty values for input column number')
        sys.exit()
    
    # now can start pre-processing:
    # This converts all columns with "object" variables (AKA string) into numbers, and creates a dictionary  
    char_cols = df.dtypes.pipe(lambda x: x[x == 'object']).index
    label_mapping = {}
    for c in char_cols:
        df[c], label_mapping[c] = pd.factorize(df[c])
        
    # for the sake of classifying/ predicting current column,
    # with other columns may having null values,
    # we will replace the other columns' null val
    for c in df:
        df[c] = df[c].fillna(df[c].mean())

    # although some issue with the above may arise if null:
    # re-set all null values to null
    df.iloc[idx,[colnum]] = np.nan
    # Accessing the rows without empty values at colnum
    df_complete = df.dropna()
    df_complete.shape
    # Accessing the rows with empty values at colnum
    df_empty = df.iloc[idx]
    df_empty.shape
    # Splitting complete rows into target/features
    features = df_complete.drop(df.columns[[colnum]], axis=1)
    target_variable = df_complete.iloc[:, [colnum]]
    # Splitting the rows with empty colnum into features and response (which is what we're predicting)
    features_empty = df_empty.drop(df.columns[[colnum]], axis=1)
    
    # now can start classifying/ predicting:
    if(flag==0): # classification
        from sklearn.ensemble import RandomForestClassifier
        print('Classifying...')
        print()
        # Training random forest
        randFor = RandomForestClassifier(n_estimators = 20)
        randFor.fit(features, target_variable)
        # Accuracy on the training set set
        print('Training score: ',randFor.score(features, target_variable))
        # Set of "City Group" predictions for the rows with empty values 
        y_pred_randFor = randFor.predict(features_empty)
        print(y_pred_randFor)
    else: # prediction
        from sklearn.linear_model import Ridge
        print('Predicting...')
        print()
        # Ridge Regression
        ridgereg = Ridge(normalize=True)
        ridgereg.fit(features,target_variable)
        y_pred_ridge = ridgereg.predict(features_empty)
        print (y_pred_ridge)

In [32]:
# this is the second version of fillEmpty:
# instead of providing a column number,
# the user should provide a column name

import sys

def fillEmpty2(original_df,colname,flag):
    # assumed: df is the dataframe to operate on,
    # colnum is the column number with missing values
    # flag = 0/1 is whether it is a classification or regression problem
    
    # copying the original dataframe
    df = original_df.copy()
    
    # testing for valid flag
    if(flag != 0 and flag != 1):
        print('Invalid input flag')
        sys.exit()
    # testing for valid column number
    number_of_columns = len(list(original_df))
    if not(colname in df):
        print('Invalid input column name')
        sys.exit()
    # retrieve the column number
    colnum = df.columns.get_loc(colname)
    # testing for the existence of empty column values
    a = df.iloc[:, [colnum]].isnull()
    idx = []
    for col in a:
        i=0
        for c in a[col]:
            if(c == True):
                idx.append(i)
            i=i+1
    if(len(idx) == 0):
        print('No empty values for input column number')
        sys.exit()
    
    # now can start pre-processing:
    # This converts all columns with "object" variables (AKA string) into numbers, and creates a dictionary  
    char_cols = df.dtypes.pipe(lambda x: x[x == 'object']).index
    label_mapping = {}
    for c in char_cols:
        df[c], label_mapping[c] = pd.factorize(df[c])
        
    # for the sake of classifying/ predicting current column,
    # with other columns may having null values,
    # we will replace the other columns' null val
    for c in df:
        df[c] = df[c].fillna(df[c].mean())

    # although some issue with the above may arise if null:
    # re-set all null values to null
    df.iloc[idx,[colnum]] = np.nan
    # Accessing the rows without empty values at colnum
    df_complete = df.dropna()
    df_complete.shape
    # Accessing the rows with empty values at colnum
    df_empty = df.iloc[idx]
    df_empty.shape
    # Splitting complete rows into target/features
    features = df_complete.drop(df.columns[[colnum]], axis=1)
    target_variable = df_complete.iloc[:, [colnum]]
    # Splitting the rows with empty colnum into features and response (which is what we're predicting)
    features_empty = df_empty.drop(df.columns[[colnum]], axis=1)
    
    # now can start classifying/ predicting:
    if(flag==0): # classification
        from sklearn.ensemble import RandomForestClassifier
        print('Classifying...')
        print()
        # Training random forest
        randFor = RandomForestClassifier(n_estimators = 20)
        randFor.fit(features, target_variable)
        # Accuracy on the training set set
        print('Training score: ',randFor.score(features, target_variable))
        # Set of "City Group" predictions for the rows with empty values 
        y_pred_randFor = randFor.predict(features_empty)
        print(y_pred_randFor)
    else: # prediction
        from sklearn.linear_model import Ridge
        print('Predicting...')
        print()
        # Ridge Regression
        ridgereg = Ridge(normalize=True)
        ridgereg.fit(features,target_variable)
        y_pred_ridge = ridgereg.predict(features_empty)
        print (y_pred_ridge)

In [12]:
# this is the third version of fillEmpty:
# it fills the empty values of the dataframe,
# and creates a new column indicating if
# a column value was synthesized

import sys

def fillEmptyNew(original_df,colname,flag):
    # assumed: df is the dataframe to operate on,
    # colnum is the column number with missing values
    # flag = 0/1 is whether it is a classification or regression problem
    
    # copying the original dataframe
    df = original_df.copy()
    
    # testing for valid flag
    if(flag != 0 and flag != 1):
        print('Invalid input flag')
        sys.exit()
    # testing for valid column number
    number_of_columns = len(list(original_df))
    if not(colname in df):
        print('Invalid input column name')
        sys.exit()
    # retrieve the column number
    colnum = df.columns.get_loc(colname)
    # testing for the existence of empty column values
    a = df.iloc[:, [colnum]].isnull()
    idx = []
    for col in a:
        i=0
        for c in a[col]:
            if(c == True):
                idx.append(i)
            i=i+1
    if(len(idx) == 0):
        print('No empty values for input column number')
        sys.exit()
    
    # now can start pre-processing:
    # This converts all columns with "object" variables (AKA string) into numbers, and creates a dictionary  
    char_cols = df.dtypes.pipe(lambda x: x[x == 'object']).index
    label_mapping = {}
    for c in char_cols:
        df[c], label_mapping[c] = pd.factorize(df[c])
        
    # for the sake of classifying/ predicting current column,
    # with other columns may having null values,
    # we will replace the other columns' null val
    for c in df:
        df[c] = df[c].fillna(df[c].mean())

    # although some issue with the above may arise if null:
    # re-set all null values to null
    df.iloc[idx,[colnum]] = np.nan
    # Accessing the rows without empty values at colnum
    df_complete = df.dropna()
    df_complete.shape
    # Accessing the rows with empty values at colnum
    df_empty = df.iloc[idx]
    df_empty.shape
    # Splitting complete rows into target/features
    features = df_complete.drop(df.columns[[colnum]], axis=1)
    target_variable = df_complete.iloc[:, [colnum]]
    # Splitting the rows with empty colnum into features and response (which is what we're predicting)
    features_empty = df_empty.drop(df.columns[[colnum]], axis=1)
    
    # now can start classifying/ predicting:
    y_new = []
    if(flag==0): # classification
        from sklearn.ensemble import RandomForestClassifier
        print('Classifying...')
        print()
        # Training random forest
        randFor = RandomForestClassifier(n_estimators = 20)
        randFor.fit(features, target_variable)
        # Accuracy on the training set set
        print('Training score: ',randFor.score(features, target_variable))
        # Set of "City Group" predictions for the rows with empty values 
        y_new = randFor.predict(features_empty)
    else: # prediction
        from sklearn.linear_model import Ridge
        print('Predicting...')
        print()
        # Ridge Regression
        ridgereg = Ridge(normalize=True)
        ridgereg.fit(features,target_variable)
        y_new = ridgereg.predict(features_empty)
    # add the new values to the dataframe
    df.iloc[idx,[colnum]] = y_new
    df[colname+'_synthesized'] = 0
    df[colname+'_synthesized'][idx] = 1
    print(df)

<b> Testing on Restaurant.csv </b>

In [4]:
original_df = pd.read_csv("Restaurant.csv")
original_df.head()

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0,07/17/1999,İstanbul,Big Cities,IL,4,5.0,4.0,4.0,2,...,3.0,5,3,4,5,5,4,3,4,5653753.0
1,1,02/14/2008,Ankara,Big Cities,FC,4,5.0,4.0,4.0,1,...,3.0,0,0,0,0,0,0,0,0,6923131.0
2,2,03/09/2013,Diyarbakır,Other,IL,2,4.0,2.0,5.0,2,...,3.0,0,0,0,0,0,0,0,0,2055379.0
3,3,02/02/2012,Tokat,Other,IL,6,4.5,6.0,6.0,4,...,7.5,25,12,10,6,18,12,12,6,2675511.0
4,4,05/09/2009,Gaziantep,Other,IL,3,4.0,3.0,4.0,2,...,3.0,5,1,3,2,3,4,3,3,4316715.0


In [34]:
df = original_df.copy()
# List of random numbers (no repeats) between 1 and 137 and then delete A_follower_count[that row]   
rows_to_delete = random.sample(range(137), 30)
# Deleting values from those rows
for x in rows_to_delete:
    df['revenue'][x] = np.nan
# Creating array of the correct answers from original data frame  
deleted_answer_list = []
for x in rows_to_delete:
    deleted_answer_list.append(original_df['revenue'][x])
# Converting it to array from a list so we can perform certain calculations
deleted_answer_array = np.array(deleted_answer_list)
deleted_answer_array

array([ 3956086.,  3570392.,  6363241.,  7217634.,  3199619.,  4491607.,
        3248660.,  4136425.,  2097022.,  3810007.,  2364478.,  3753720.,
        7201784.,  3939804.,  3008199.,  4651866.,  3784230.,  2525375.,
        4052733.,  6941173.,  8904084.,  3445076.,  6782425.,  4758476.,
        3347767.,  1847826.,  4882985.,  4429512.,  1270499.,  2732645.])

In [48]:
# test the function
fillEmpty(df,42,1) # prediction problem

Predicting...

[[ 4859740.08350571]
 [ 3728457.71280077]
 [ 4747301.67852731]
 [ 3222950.8043993 ]
 [ 4561678.81569383]
 [ 4539431.13895401]
 [ 3997883.59249391]
 [ 4937836.34377644]
 [ 5759218.47930182]
 [ 5509910.54931136]
 [ 5339310.54251825]
 [ 5799161.564878  ]
 [ 4068655.21856287]
 [ 5832835.33173468]
 [ 4089367.20097733]
 [ 4078853.43726617]
 [ 3968482.16226212]
 [ 4153452.39903516]
 [ 4380404.21429641]
 [ 4164756.72078256]
 [ 5480252.50591854]
 [ 4185454.07835212]
 [ 3779801.40969928]
 [ 4994081.80228943]
 [ 4948974.7532436 ]
 [ 5257586.93332632]
 [ 5056732.56955768]
 [ 5426897.76439071]
 [ 5008856.98454723]
 [ 5118072.47283297]]


In [49]:
df2 = original_df.copy()
# List of random numbers (no repeats) between 1 and 137 and then delete A_follower_count[that row]   
rows_to_delete = random.sample(range(137), 30)
# Deleting values from those rows
for x in rows_to_delete:
    df2['City Group'][x] = np.nan
# Creating array of the correct answers from original data frame  
deleted_answer_list = []
for x in rows_to_delete:
    deleted_answer_list.append(original_df['City Group'][x])
# Converting it to array from a list so we can perform certain calculations
deleted_answer_array_city_group = np.array(deleted_answer_list)
deleted_answer_array_city_group

array(['Other', 'Big Cities', 'Other', 'Other', 'Other', 'Other', 'Other',
       'Big Cities', 'Other', 'Big Cities', 'Other', 'Big Cities',
       'Big Cities', 'Other', 'Big Cities', 'Other', 'Other', 'Big Cities',
       'Other', 'Big Cities', 'Big Cities', 'Other', 'Big Cities', 'Other',
       'Big Cities', 'Big Cities', 'Big Cities', 'Big Cities', 'Other',
       'Big Cities'], 
      dtype='<U10')

In [50]:
# test the function
fillEmpty(df2,3,0) # classification problem

Classifying...

Training score:  1.0
[ 0.  1.  1.  0.  1.  1.  1.  1.  0.  0.  1.  0.  1.  0.  0.  1.  0.  0.
  0.  0.  0.  1.  1.  0.  0.  1.  1.  0.  0.  0.]


<b> Testing on titanic_with_empties.csv </b>

In [51]:
titanic_df = pd.read_csv("titanic_with_empties.csv")
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [52]:
# test the function on Cabin
fillEmpty(titanic_df,10,0) # classification problem

Classifying...

Training score:  1.0
[  15.   15.    5.  131.    3.    3.    1.   15.   23.    3.    4.    3.
    5.    3.    5.    5.    3.    3.   23.   15.  131.   15.    6.  131.
    5.    8.   16.   15.   15.    3.    3.    3.    9.    5.   26.    3.
   15.    3.  131.   23.    3.    7.   15.    9.   14.   15.   26.    7.
   15.    7.  115.    3.   15.    7.   15.   14.    7.   19.    3.   73.
   15.   15.   26.   22.   15.   15.  131.   20.   14.    7.    3.   15.
   15.   15.   15.   23.   15.   15.   22.   26.   15.   15.   15.   15.
   15.   15.   15.   15.   23.    3.   15.    3.    3.   15.  131.   22.
    7.   19.   15.   26.    3.   15.   15.   15.   15.   15.    3.   26.
   22.   52.   15.    3.   15.    3.   15.   15.   26.   15.    3.   52.
   45.   15.    3.   15.   29.   15.   15.   15.    7.    3.   22.   15.
   15.   32.   26.    3.   50.    3.    3.    3.   15.    3.    7.   22.
   15.    7.   52.    7.    3.    3.   34.    3.   15.   22.   15.    3.
  131.    3.  

In [53]:
# test the function on Survived
fillEmpty(titanic_df,1,0) # classification problem

Classifying...

Training score:  0.991907514451
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.
  0.  0.  0.  0.  1.  0.  0.  0.]


<b> Testing on iris_with_empties.csv </b>

In [54]:
iris_df = pd.read_csv("iris_with_empties.csv")
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,species
0,,3.5,setosa
1,4.9,3.0,setosa
2,4.7,3.2,
3,4.6,3.1,setosa
4,5.0,3.6,setosa


In [58]:
# test the function on sepal_length
fillEmpty(iris_df,0,1) # prediction problem

Predicting...

[[ 5.58385052]
 [ 5.5814514 ]
 [ 5.59344702]
 [ 5.5814514 ]
 [ 5.57425403]
 [ 5.867928  ]
 [ 6.18319407]
 [ 6.18799232]
 [ 6.1759967 ]]


In [59]:
# test the function on species
fillEmpty(iris_df,2,0) # classification problem

Classifying...

Training score:  0.914285714286
[ 0.  0.  0.  0.  1.  0.  1.  2.  1.  2.]


<b> Testing on energy_with_empties.csv </b>

In [60]:
energy_df = pd.read_csv("energy_with_empties.csv")
energy_df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Y1
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84


In [61]:
# test the function on Y1
fillEmpty(energy_df,8,1) # prediction problem

Predicting...

[[ 27.65300887]
 [ 24.87584998]
 [ 25.78440438]
 [ 14.38551765]
 [ 28.78776467]
 [ 30.32084643]
 [ 16.97896861]
 [ 15.43098267]
 [ 26.9657231 ]
 [ 16.18810566]
 [ 28.08441194]
 [ 27.36951089]
 [ 17.69993448]
 [ 17.32018039]
 [ 31.90703585]
 [ 32.93583091]
 [ 28.54489047]
 [ 29.35342327]
 [ 16.12444674]
 [ 18.8320092 ]
 [ 33.38292355]
 [ 19.19249214]]


<b> Testing on diabetes_with_empties.csv </b>

In [6]:
diabetes_df = pd.read_csv("diabetes_with_empties.csv")
diabetes_df.head()

Unnamed: 0,TimesPregnant,glucoseLevel,BP,insulin,BMI,Pedigree,Age,IsDiabetic
0,6,148.0,72,0,33.6,0.627,50.0,1
1,1,,66,0,26.6,0.351,31.0,0
2,8,183.0,64,0,23.3,0.672,,1
3,1,,66,94,28.1,0.167,21.0,0
4,0,137.0,40,168,43.1,2.288,33.0,1


In [8]:
# test the function on glucoseLevel
fillEmpty(diabetes_df,1,1) # prediction problem

Predicting...

[[ 110.22586021]
 [ 111.23619473]
 [ 126.64943274]
 [ 165.81852911]
 [ 139.0678635 ]
 [ 123.74505246]
 [ 110.25503689]
 [ 126.02159883]
 [ 108.0717632 ]
 [ 118.59162567]
 [ 131.44741518]
 [ 108.10021271]
 [ 129.84577199]
 [ 129.429291  ]
 [ 109.19763176]
 [ 130.82853592]
 [ 132.6827452 ]
 [ 120.15041996]
 [ 140.95280942]
 [ 133.10629116]
 [ 109.80068339]
 [ 123.02784283]
 [ 116.63436823]
 [ 134.66353125]
 [ 106.84183967]
 [ 107.16755821]
 [ 114.14224633]
 [ 120.39957272]
 [ 120.45788445]
 [ 130.6438027 ]
 [ 116.08773432]
 [ 118.55309261]
 [ 111.51018416]
 [ 143.31758113]
 [ 118.88380431]
 [ 132.85496529]
 [ 122.27160436]
 [ 116.21901929]
 [ 108.67538893]
 [ 117.79693612]
 [ 116.8858277 ]]


In [9]:
# test the function on age
fillEmpty(diabetes_df,6,0) # classification problem

Classifying...

Training score:  1.0
[ 34.  38.  21.  22.  56.  21.  28.  52.  21.  35.  22.  28.  46.  37.  31.
  21.  30.  33.  21.  52.  31.  43.  28.  22.  33.  23.  22.  28.  49.  22.
  24.  21.  57.  58.  29.  25.  40.  21.  22.  24.  40.]


In [33]:
# new test: testing the same, but for fillEmpty2
fillEmpty2(diabetes_df,'Age',0) # classification problem

Classifying...

Training score:  1.0
[ 36.  33.  25.  22.  67.  21.  28.  26.  25.  58.  22.  22.  46.  37.  22.
  21.  29.  23.  21.  52.  28.  21.  28.  21.  33.  24.  24.  24.  39.  25.
  37.  24.  57.  37.  29.  25.  31.  21.  22.  24.  42.]


In [11]:
# new test: testing the same, but for fillEmptyNew
fillEmptyNew(diabetes_df,'Age',0) # classification problem

Classifying...

Training score:  1.0
     TimesPregnant  glucoseLevel  BP  insulin   BMI  Pedigree   Age  \
0                6    148.000000  72        0  33.6     0.627  50.0   
1                1    121.078404  66        0  26.6     0.351  31.0   
2                8    183.000000  64        0  23.3     0.672  41.0   
3                1    121.078404  66       94  28.1     0.167  21.0   
4                0    137.000000  40      168  43.1     2.288  33.0   
5                5    116.000000  74        0  25.6     0.201  30.0   
6                3     78.000000  50       88  31.0     0.248  26.0   
7               10    115.000000   0        0  35.3     0.134  29.0   
8                2    197.000000  70      543  30.5     0.158  53.0   
9                8    121.078404  96        0   0.0     0.232  54.0   
10               4    110.000000  92        0  37.6     0.191  26.0   
11              10    168.000000  74        0  38.0     0.537  34.0   
12              10    139.000000  80    