# Set up the workspace
* Import required packages
* create function definitions
* import the data

In [None]:
#import required packages

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import missingno as msno
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [35]:
def map_categories(df, new_column, dictionary, recoded_column, breaks, recoded_expression): 
    df[new_column] = np.nan
    for key, value in dictionary.items():
        matches = df.loc[df[recoded_column] == key]
    #df_final['experienced'][matches.index] = value
        if value >= breaks[0]: 
        #print(key, value)
            df[new_column][matches.index] = recoded_expression[0]
        elif value >= breaks[1] and value < breaks[0]: 
        #print(key, value)
            df[new_column][matches.index] = recoded_expression[1]
        elif value >= breaks[2] and value < breaks[1]:
        #print(key, value)
            df[new_column][matches.index] = recoded_expression[2]
        elif value <breaks[2]: 
        #print(key, value)
            df[new_column][matches.index] = recoded_expression[3]
        else: 
            print('no match')
    return df

In [36]:
def create_dict_counts(df, column):
    dictionary = (df[column].value_counts(dropna=False).to_dict())
    funder_quant = df[column].value_counts(dropna=False).quantile([.25, .5, .75])
    return dictionary, funder_quant


In [37]:
def encode_df(df): 
    obj_df = df.select_dtypes(include=[object])
    obj_df[pd.isnull(obj_df)]  = 'NaN'
    #obj_df.drop(['public_meeting', 'permit'], axis=1, inplace=True)
    #(obj_df.to_csv('C:/Users/renee/Desktop/DSBA/Fall 2018/Group Project/Code/funder.csv'))
    encoded_df = pd.DataFrame(obj_df.apply(LabelEncoder().fit_transform))
    df.drop(obj_df.columns, axis=1, inplace=True)
    new_df = df.join(encoded_df, how='left')
    return(new_df)
#print(training_x)

In [38]:
def encode_dummies(df):
    new_df = df.get_dummies()
    return new_df

In [39]:
#import the training data sets

training_x = pd.read_csv("C:/Users/renee/Desktop/DSBA/Fall 2018/Group Project/Data/TrainingData.csv", index_col="id", 
                         dtype = {'public_meeting': bool, 'permit': bool}) 
#dtype = {'funder': str} )
training_y = pd.read_csv("C:/Users/renee/Desktop/DSBA/Fall 2018/Group Project/Data/TrainingLabels.csv", index_col="id")

#import the test set
test_set = pd.read_csv("C:/Users/renee/Desktop/DSBA/Fall 2018/Group Project/Data/TestData.csv", 
                       dtype = { 'public_meeting': bool, 'permit': bool})

In [40]:
# recorded_by has all same values. Drop because there is no information
training_x.drop('recorded_by', inplace=True, axis=1)
test_set.drop('recorded_by', inplace=True, axis=1)

In [41]:

#print(training_x.head(5))
#print(training_x.shape)
#print(training_x.columns)
#training_x.describe()


In [42]:
'''
print(training_y.head(5))
print(training_y.shape)
'''

'\nprint(training_y.head(5))\nprint(training_y.shape)\n'

# Data Cleaning
* change datetype of date_recorded to correct format 
* replace 0s in the funder column to Unknown
* impute missing values
* recategorize funder and installer into new variables with 4 levels, based on their quartile ranges

In [43]:
# function to clean the data without binning the large categorical variables 
def data_cleaning_full(training_df):
    training_df.loc[:,['date_recorded']] = training_df[['date_recorded']].apply(pd.to_datetime, format='%m/%d/%Y')
    training_df['date_recorded'] = training_df['date_recorded'].dt.strftime('%m%d%Y').astype(float)
    training_df.fillna('Unknown')
    #training_df[['public_meeting', 'permit']].fillna("")
    #training_df[['public_meeting', 'permit']].astype(bool)
    #replace the 0's in the funder column with Unknown. Otherwise, the data type will not be consistent when 
    # trying to perform other operations on the values 
    training_df['funder'].loc[training_df['funder'] == '0'] = 'Unknown'
    return training_df



In [44]:
# function to clean data with binning large cateogrical variables 
def data_cleaning_bins(training_df): 
    df= data_cleaning_full(training_df)
     # create dictionaries of the counts of the objects with a lot of classes
    dict_funder, quants_funder = create_dict_counts(df, 'funder')
    print(quants_funder)
    dict_installer, quants_installer = create_dict_counts(df, 'installer')
    new_df = map_categories(df, 'funder_experience', dict_funder, 'funder', (1356, 720, 455), 
                        ('very experienced', 'experienced', 'some experience', 'little experience' ))
    new_df2 = map_categories(new_df, 'installer_experience', dict_installer, 'installer', (1269, 480, 390), 
                         ('very experienced', 'experienced', 'some experience', 'little experience'))
    return new_df2
    

In [45]:
# check the number of levels in the categorical variables

obj_df = training_x.select_dtypes(include=[object])

for obj in obj_df: 
    print(obj + ": " + str(len(set(obj_df[obj]))))


date_recorded: 356
funder: 1897
installer: 2146
wpt_name: 37400
basin: 9
subvillage: 19288
region: 21
lga: 125
ward: 2092
public_meeting: 3
scheme_management: 13
scheme_name: 2697
permit: 3
extraction_type: 18
extraction_type_group: 13
extraction_type_class: 7
management: 12
management_group: 5
payment: 7
payment_type: 7
water_quality: 8
quality_group: 6
quantity: 5
quantity_group: 5
source: 10
source_type: 7
source_class: 3
waterpoint_type: 7
waterpoint_type_group: 6


In [46]:
df_cleaned = data_cleaning_full(training_x )


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [57]:
def find_frequencies(df, column):
    for column_name in column: 
        col_name = str(column_name+'_counts') 
        #print(col_name)
        counts = pd.DataFrame(df[column_name].value_counts())
        #print(counts.index)
        df = df.merge(counts, how='left', left_on=column_name, right_index=True)
        #print(new_df)
        #df = df.loc[df[col_name] == column_name] = counts
    return(df)

In [77]:
df_freq = find_frequencies(df_cleaned, ('funder', 'installer', 'ward', 'scheme_name'))
df_freq.to_csv("C:/Users/renee/Desktop/DSBA/Fall 2018/Group Project/Data/DataFreq.csv")


In [63]:
#print(df_freq.columns)
#print(df_freq.head(10))

In [74]:
list = ('funder_y', 'installer_y', 'ward_y', 'scheme_name_y')
for column in list:
    print(df_freq.columns)
    df_freq = pd.DataFrame(df_freq.loc[df_freq[column]] != 1)
    print(df_freq.shape)
    #new_df = df_final.drop() for row in df_final if df_final[column].loc[df_final[column] == 1]
    
print(df_freq.shape)
df_freq.to_csv("C:/Users/renee/Desktop/DSBA/Fall 2018/Group Project/Data/DataFreq.csv")

Index(['amount_tsh', 'date_recorded', 'funder_x', 'gps_height', 'installer_x',
       'longitude', 'latitude', 'wpt_name', 'num_private', 'basin',
       'subvillage', 'region', 'region_code', 'district_code', 'lga', 'ward_x',
       'population', 'public_meeting', 'scheme_management', 'scheme_name_x',
       'permit', 'construction_year', 'extraction_type',
       'extraction_type_group', 'extraction_type_class', 'management',
       'management_group', 'payment', 'payment_type', 'water_quality',
       'quality_group', 'quantity', 'quantity_group', 'source', 'source_type',
       'source_class', 'waterpoint_type', 'waterpoint_type_group', 'funder_y',
       'installer_y', 'ward_y', 'scheme_name_y'],
      dtype='object')
(0, 42)
Index(['amount_tsh', 'date_recorded', 'funder_x', 'gps_height', 'installer_x',
       'longitude', 'latitude', 'wpt_name', 'num_private', 'basin',
       'subvillage', 'region', 'region_code', 'district_code', 'lga', 'ward_x',
       'population', 'public_mee

# Use Label Encoded to Encode Categorical Variables

In [78]:
def label_encoding(df, original): 
# dropping public_meeting. getting error 
#"'<' not supported between instances of 'str' and 'bool'", 'occurred at index public_meeting')
#TO DO: research how to correct this. Data type is still mixed instead of bool which is causing error 
    temp_df = df.drop(['public_meeting', 'permit'], axis=1)
    df_encoded = encode_df(temp_df)
    df_encoded['public_meeting'] = original['public_meeting'].values
    df_encoded['permit'] = original['permit'].values
    return df_encoded

In [79]:
df_encoded = label_encoding(df_freq, training_x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._where(-key, value, inplace=True)


# Impute Missing Values

In [80]:
# NOTE: dropping the data_recorded for the imputation right now because otherwise column is throwing an error
# TO DO: figure out why date format isn't working in imputation--> convert to UTC? 

#df_encoded.drop('date_recorded', inplace=True, axis=1) 
from sklearn.preprocessing import Imputer
def impute_missing(df):
    imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
    df_imputed = pd.DataFrame(imp.fit_transform(df), columns=df.columns)
    return df_imputed



In [81]:
df_imputed = impute_missing(df_encoded)

# Use Random Forest to Predict Values coded as 0 that are "unknown" or missing
* longitude 
* construction_year

In [82]:
from sklearn.ensemble import RandomForestRegressor

def impute_falsevalues(df, column): 
    test_y_long = df[[column]].loc[df[column] == 0]
    test_x_long = df.loc[df[column] ==0]
    training_x_long = df.loc[df[column] != 0]
    training_y_long = df[column].loc[df[column] != 0]
    training_x_long.drop(column, inplace=True, axis=1)
    rf_regressor = RandomForestRegressor(n_estimators = 1000, max_depth = 10, 
                                     bootstrap=True, oob_score=True, random_state=1029, n_jobs=-2)
    rf_regressor.fit(training_x_long, training_y_long)
    #print(test_x_long.reshape(-1,1))
    test_x_long.drop(column, inplace=True, axis=1)
    #print(test_x_long)
    long_pred = rf_regressor.predict(test_x_long)
    df.loc[test_x_long.index, column] = long_pred
    return(df)

In [83]:
df_final = impute_falsevalues(df_imputed, 'longitude')
df_final = impute_falsevalues(df_final, 'construction_year')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [None]:
print(df_final.columns)
print(df_final.shape)

# Train test split

In [84]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_final, training_y, test_size=0.20, random_state=1234)

In [85]:
y = np.ravel(y_train)
y_test = np.ravel(y_test)

# Train AdaBoost Model (with Random Forest Base Classifier)

In [110]:

from sklearn.metrics import confusion_matrix
def run_AdaBoost(X_train, y_train, X_test, y_test):  
    ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=30,
                           min_impurity_decrease = .0000001, random_state=0),
    n_estimators=5000,
    learning_rate=.1,
    algorithm="SAMME.R", 
    )

    ada_clf.fit(X_train, y_train)
    preds = ada_clf.predict(X_test)
    accuracy = (accuracy_score(preds, y_test))
    confusion = confusion_matrix(preds, y_test)
    model = ada_clf
    return (accuracy, confusion, model)

In [111]:
#y = np.ravel(training_y)
accuracy, confusion, model = run_AdaBoost(X_train, y, X_test, y_test)
print("accuracy: " + str(accuracy))
print(confusion)
    

accuracy: 0.797053872053872
[[5842  478 1151]
 [ 114  253   51]
 [ 485  132 3374]]


# Gradient Boosting Model

In [32]:
from sklearn.ensemble import GradientBoostingClassifier
def run_gradboost(X_train, y_train, X_test, y_test):
    gb = GradientBoostingClassifier(n_estimators=5000, learning_rate=.1, max_depth=30, 
                                    random_state=1234, max_features=.75).fit(X_train, y_train)
    preds = gb.predict(X_test)
    accuracy = (accuracy_score(preds, y_test))
    confusion = confusion_matrix(preds, y_test)
    model = gb
    return (accuracy, confusion, model)

In [33]:
#y = np.ravel(training_y)
accuracy_gb, confusion_gb, model_gb = run_gradboost(X_train, y, X_test, y_test)
print("accuracy: " + str(accuracy_gb))
print(confusion_gb)

accuracy: 0.9617508417508418
[[31479   416   872]
 [  179  3766    69]
 [  601   135 21883]]


In [30]:
print("accuracy: " + str(accuracy_gb))
print(confusion_gb)

NameError: name 'accuracy_gb' is not defined

# Random Forest Model

In [None]:
def run_randomforest(X_train, y_train, X_test, y_test):
    rf = RandomForestClassifier(n_estimators=5000, learning_rate=.1, max_depth=30, min_impurity_decrease = .000001 
                                    random_state=1234, max_features=.75).fit(X_train, y_train)
    preds = rf.predict(X_test)
    accuracy = (accuracy_score(preds, y_test))
    confusion = confusion_matrix(preds, y_test)
    model = rf
    return (accuracy, confusion, model)

In [None]:
accuracy_rf, confusion_rf, model_rf = run_gradboost(X_train, y, df_final, training_y)
print("accuracy: " + str(accuracy_rf))
print(confusion_rf)

# Support Vector Machine Model

In [None]:
from sklearn.svm import SVC
def run_svm(X_train, y_train, X_test, y_test):
    svm = SVC(class_weight='balanced', n_jobs=-2).fit(X_train, y_train)
    preds = svm.predict(X_test)
    accuracy = (accuracy_score(preds, y_test))
    confusion = confusion_matrix(preds, y_test)
    model = svm
    return (accuracy, confusion, model)

In [None]:
y = np.ravel(training_y)
accuracy, confusion, model_svm = run_svm(X_test, y_test, df_final, y)
print("accuracy: " + str(accuracy))
print(confusion)

# Perform 5-fold Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X_test, y_test, cv=5)
print(scores)

# Predict test set and create submission csv

In [None]:
test_set.describe()

In [90]:
df_test_cleaned = data_cleaning_full(test_set)
df_test_freq = find_frequencies(df_test_cleaned, ('funder', 'installer', 'ward', 'scheme_name'))
list = ('funder_y', 'installer_y', 'ward_y', 'scheme_name_y')
#for column in list:
   # df_test_freq = df_test_freq[column] != 1
df_test_encoded = label_encoding(df_test_freq, test_set)
df_test_imputed = impute_missing(df_test_encoded)
df_test_final = impute_falsevalues(df_test_imputed, 'longitude')
df_test_final = impute_falsevalues(df_test_final, 'construction_year')

# drop some columns we know we won't use for model 
#df_final_test = df_imputed_test.drop(['funder', 'installer'], axis=1)
#print(df_final_test.columns)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._where(-key, value, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is tryin

In [91]:
submission_df = test_set[['id']]
df_test_final.drop('id', inplace=True, axis=1)


In [106]:
#list1 = list(df_final_test.index)
#index = np.reshape(list1, len(list1)).T
preds_test = model.predict(df_test_final)
print(preds_test.shape)
list2 = list(preds_test)
final_preds = np.reshape(list2, len(list2)).T
submission_df = test_set[['id']]
submission_df['status_group'] = final_preds
             
     

(14850,)


TypeError: 'tuple' object is not callable

In [97]:
submission_df.to_csv("C:/Users/renee/Desktop/DSBA/Fall 2018/Group Project/Data/Submission_AdaBoost.csv")


In [96]:
print(preds_test.shape)
#list2 = list(preds_test)
final_preds = np.reshape(preds_test, len(preds_test)).T
submission_df = test_set[['id']]
submission_df['status_group'] = final_preds

(14850,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


# Code below is from other tests. Not used in current model. 

# Dummy Encoding 
Instead of Label Encoding 

In [None]:
df_dummy = encode_dummies(df_new2)
accuracy2, confusion2 = run_AdaBoost(df_dummy)
print("accuracy: " + str(accuracy2))
print(confusion2)
    

#  Grid Search for Hyper-parameter tuning


In [None]:
'''
{'base_estimator__max_depth': 11, 'learning_rate': 0.1, 'n_estimators': 2501}
'''

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

tuned_parameters = [{'n_estimators': range(1,4001, 500), 'learning_rate': [.1, 1.6, .3],
                     'base_estimator__max_depth': range(1,21,5), 
                    'base_estimator__max_features': ['auto', 'sqrt','log2']
                    }]

adaboost_tuned = GridSearchCV(AdaBoostClassifier(RandomForestClassifier(class_weight='balanced')), 
                              tuned_parameters, cv=5, 
                       error_score=0, n_jobs=-2)
adaboost_tuned.fit(X_train, y)

print(adaboost_tuned.best_params_)

In [None]:
#gb.get_params().keys

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier

tuned_parameters = [{'n_estimators': range(1,3000, 500), 'learning_rate': [.1, 1.5, .3],
                     'max_depth': range(1,20,5), 
                   'max_features': ['auto', 'sqrt','log2']}]

gb_tuned = GridSearchCV(GradientBoostingClassifier(), tuned_parameters, cv=10, 
                       error_score=0)
gb_tuned.fit(X_train, y)
-k
print(gb_tuned.best_params_)

In [None]:
#print(encoded_df['installer'].describe())

In [None]:
print(preds_test)

In [None]:
gb_preds = gb.predict(df_final_test)
final_preds_gb = np.reshape(list2, len(list2)).T
submission_df = test_set[['id']]
submission_df['status_group'] = final_preds_gb
submission_df.index = submission_df['id']