In [None]:
#import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from yellowbrick.classifier import ConfusionMatrix, ROCAUC
from yellowbrick.style import set_palette

%matplotlib inline

In [None]:
# Load the feature and label tables, join them on `id`, and build one-hot target columns.

X = pd.read_csv('./Desktop/MetisProjects/Project_3/Tanzania_train_X.csv')
y = pd.read_csv('./Desktop/MetisProjects/Project_3/Tanzania_train_y.csv')
df = X.merge(y, on='id')

# `dummy_targets` was referenced without ever being created, so this cell raised
# NameError on a fresh kernel. One-hot encoding the label table gives `id` plus one
# column per status value, which lines up with the four names assigned below.
# NOTE(review): assumes `y` holds exactly `id` and a three-valued `status_group` - confirm.
dummy_targets = pd.get_dummies(y, columns=['status_group'])
dummy_targets.columns = ['id', 'functional', 'functional_needs_repairs', 'non_functional']
# Drop the leading `id` column so only the indicator columns are appended to X.
dummy_targets = dummy_targets[dummy_targets.columns[1:]]
concat_train_data = pd.concat([X, dummy_targets], axis=1)

In [None]:
# Column dtypes and non-null counts for the merged table.
df.info()

# Rows in which all three of these fields are populated (shown as the cell output).
location_year_cols = ['longitude', 'latitude', 'construction_year']
df.loc[:, location_year_cols].dropna()

In [None]:
#reduce the number of values in dummy columns while preserving most of the information
def funder_wrangler(row):
    '''Map a row's 'funder' value to a short code.

    Five funders are kept as their own category; every other value
    (including missing ones) is returned as 'other'.

    row: mapping-like object (e.g. a DataFrame row) with a 'funder' key.
    Returns the short code as a string.
    '''
    short_codes = {
        'Government Of Tanzania': 'gov',
        'Danida': 'danida',
        'Hesawa': 'hesawa',
        'Rwssp': 'rwssp',
        'World Bank': 'world_bank',
    }
    return short_codes.get(row['funder'], 'other')
    
# Replace the raw funder names with their reduced categories.
df['funder'] = df.apply(funder_wrangler, axis=1)


# Numeric stand-in for status_group: it gives the pivot table a column to count,
# so pump functionality can be broken down by funder.
vals_to_replace = {
    'functional': 2,
    'functional needs repair': 1,
    'non functional': 0,
}
df['status_group_vals'] = df['status_group'].replace(vals_to_replace)

piv_table = df.pivot_table(
    index=['funder', 'status_group'],
    values='status_group_vals',
    aggfunc='count',
)
piv_table



In [None]:
#same idea as above reduce categories for one hot encoding

def installer_wrangler(row):
    '''Map a row's 'installer' value to a short code.

    Five installers are kept as their own category; every other value
    (including missing ones) is returned as 'other'.

    row: mapping-like object (e.g. a DataFrame row) with an 'installer' key.
    Returns the short code as a string.
    '''
    short_codes = {
        'DWE': 'dwe',
        'Government': 'gov',
        'RWE': 'rwe',
        'Commu': 'commu',
        'DANIDA': 'danida',
    }
    return short_codes.get(row['installer'], 'other')

# Replace the raw installer names with their reduced categories.
df['installer'] = df.apply(installer_wrangler, axis=1)

# Remove subvillage, then label missing public_meeting entries explicitly.
df = df.drop('subvillage', axis=1)
df['public_meeting'] = df['public_meeting'].fillna('Unknown')

In [None]:
def scheme_wrangler(row):
    '''Map a row's 'scheme_management' value to a short code.

    Five management schemes are kept as their own category; every other
    value (including missing ones) is returned as 'other'.

    row: mapping-like object (e.g. a DataFrame row) with a
         'scheme_management' key.
    Returns the short code as a string.
    '''
    short_codes = {
        'VWC': 'vwc',
        'WUG': 'wug',
        'Water authority': 'wtr_auth',
        'WUA': 'wua',
        'Water Board': 'wtr_brd',
    }
    return short_codes.get(row['scheme_management'], 'other')

# Replace the raw scheme_management values with their reduced categories.
df['scheme_management'] = df.apply(scheme_wrangler, axis=1)

# scheme_name has a lot of categories with no clear top dogs. Probably safe to drop.
df = df.drop('scheme_name', axis=1)

# Label missing permit entries explicitly, then show the remaining null count per column.
df['permit'] = df['permit'].fillna('Unknown')
df.isnull().sum()

In [None]:
# Number of distinct values in each string column.
str_cols = df.select_dtypes('object')
str_cols.apply(lambda x: len(x.unique()))

# Parse the recording date once so it can be used in date arithmetic.
df['date_recorded'] = pd.to_datetime(df['date_recorded'])
df['date_recorded'].describe()

# Express the date as whole days elapsed before a fixed reference date.
# `pd.datetime` no longer exists in current pandas and casting to 'timedelta64[D]'
# is no longer supported, so use `pd.Timestamp` and the `.dt.days` accessor instead.
reference_date = pd.Timestamp('2013-12-03')
elapsed = reference_date - df['date_recorded']
df = df.rename(columns={'date_recorded': 'days_since_recorded'})
df['days_since_recorded'] = elapsed.dt.days
df['days_since_recorded'].describe()



In [None]:
# Count pumps per (basin, status) pair to see whether any basin stands out
# for being non-functional.
piv_table = df.pivot_table(
    index=['basin', 'status_group'],
    values=['status_group_vals'],
    aggfunc='count',
)
piv_table

In [None]:
# region / lga / ward are highly correlated with each other. I'll drop them for now.
# They could be worth including though, so I might come back to them.
#
# All removals are collected into one list and dropped in a single call. `axis` is
# passed by keyword: the positional form `df.drop(col, 1)` is rejected by current pandas.
cols_to_drop = [
    'region', 'lga', 'ward',
    'wpt_name',                                  # no relevant information
    'recorded_by',                               # no relevant information
    'extraction_type', 'extraction_type_group',  # similar information
    'management', 'management_group',            # redundant information
    'payment',                                   # redundant information
    'quality_group',                             # redundant information
    'quantity_group',
    'source',
    'gps_height', 'longitude', 'latitude', 'region_code', 'district_code',
    'num_private', 'id',
    'status_group_vals',                         # helper column used only by the pivot tables
]
df = df.drop(cols_to_drop, axis=1)




In [None]:
def construction_wrangler(row):
    '''Bucket a row's 'construction_year' into a decade label.

    Years 1960-2009 map to '60s', '70s', '80s', '90s' or '00s'; 2010 and
    later map to '10s'; anything else (earlier years, 0, NaN) is 'unknown'.

    row: mapping-like object (e.g. a DataFrame row) with a numeric
         'construction_year' key.
    Returns the decade label as a string.
    '''
    year = row['construction_year']
    decade_starts = ((1960, '60s'), (1970, '70s'), (1980, '80s'),
                     (1990, '90s'), (2000, '00s'))
    for start, label in decade_starts:
        if start <= year < start + 10:
            return label
    if year >= 2010:
        return '10s'
    return 'unknown'
    
# Replace the numeric construction year with its decade label.
df['construction_year'] = df.apply(construction_wrangler, axis=1)

df.construction_year.value_counts()

# 'status_group_vals' has already been dropped by the column-cleanup cell, so an
# unconditional drop here raised KeyError (and its result was discarded anyway).
# Keep it as a guarded, assigned drop so the helper column can never reach the CSV.
df = df.drop('status_group_vals', axis=1, errors='ignore')

# Persist the cleaned table for the modelling cells.
df.to_csv('./Desktop/project3_ready_to_model.csv', index=False)

In [None]:
# Reload the cleaned table from the CSV written at the end of preprocessing,
# replacing whatever `df` currently holds.
df = pd.read_csv('./Desktop/project3_ready_to_model.csv')

In [None]:
# Categorical columns to expand into one-hot indicator columns.
dummy_cols = [
    'funder', 'installer', 'basin', 'public_meeting', 'scheme_management',
    'permit', 'construction_year', 'extraction_type_class', 'payment_type',
    'water_quality', 'quantity', 'source_type', 'source_class',
    'waterpoint_type', 'waterpoint_type_group',
]
df = pd.get_dummies(df, columns=dummy_cols)


def _encode_status(label):
    '''Return 2 for 'functional', 1 for 'non functional', 0 for anything else.'''
    if label == 'functional':
        return 2
    if label == 'non functional':
        return 1
    return 0


# Integer-encode the target column.
df['status_group'] = df['status_group'].apply(_encode_status)

In [None]:
# Separate the features from the integer-encoded target.
X = df.drop('status_group', axis=1)
y = df.status_group

# Stratified 80/20 split. A fixed random_state makes the split - and therefore every
# score and confusion matrix computed from it - reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, stratify=y, random_state=42)

print(X_train.shape, y_train.shape)

In [None]:
# See which classifiers work best with default settings and take it from there.
# Each accuracy is printed explicitly: a bare `model.score(...)` in the middle of a
# cell is evaluated but never displayed.

clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Decision tree accuracy:', clf.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))

mod = LogisticRegression()
mod.fit(X_train, y_train)
y_pred = mod.predict(X_test)
print('Logistic regression accuracy:', mod.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))

kNN = KNeighborsClassifier()
kNN.fit(X_train, y_train)
# predict() takes the feature matrix only; passing y_test as well raised TypeError.
y_pred = kNN.predict(X_test)
print('kNN accuracy:', kNN.score(X_test, y_test))
print(confusion_matrix(y_test, y_pred))

In [None]:
# Tunable parameters of LinearSVC, for reference while building the grid.
LinearSVC().get_params()

# Scale the features, then fit a linear SVM; grid keys are prefixed with the step name.
pipe_svc = Pipeline([('scl', StandardScaler()),
                     ('clf', LinearSVC())])

param_grid = {'clf__C': [0.001, 0.01, 0.1, 1.0],
              'clf__class_weight': [None, 'balanced']}

estimator = GridSearchCV(estimator=pipe_svc,
                         param_grid=param_grid)

# The search has to be fitted before best_params_ or score() can be used;
# this call was missing, so the following lines failed on an unfitted search.
estimator.fit(X_train, y_train)

best_params = estimator.best_params_

validation_accuracy = estimator.score(X_test, y_test)
print('Validation accuracy: ', validation_accuracy)
print(best_params)


In [None]:
# Tunable parameters of GradientBoostingClassifier, for reference while building the grid.
GradientBoostingClassifier().get_params()

gbm = GradientBoostingClassifier()

# min_samples_leaf must be integers here: scikit-learn reads a float as a *fraction*
# of the training samples, so float values such as 11.1 from np.linspace(10, 20, 10)
# are invalid or produce degenerate trees rather than "at least 11 samples per leaf".
param_grid = {'learning_rate': np.linspace(0.01, .1, 10),
              'max_depth': [10, 15],
              'min_samples_leaf': list(range(10, 21)),
              'max_features': [1.0],
              'n_estimators': [100, 200],
              }

estimator = GridSearchCV(gbm, param_grid=param_grid, n_jobs=-1)

estimator.fit(X_train, y_train)

best_params = estimator.best_params_
print(best_params)

# Accuracy of the best parameter combination on the held-out split.
test_accuracy = estimator.score(X_test, y_test)

print('validation accuracy:', test_accuracy)

In [None]:
# Fit one gradient-boosting model with fixed hyper-parameters and evaluate it
# on the held-out split.
gbm_settings = dict(
    learning_rate=0.07,
    max_depth=14,
    min_samples_leaf=16,
    max_features=1.0,
    n_estimators=100,
)
gbm = GradientBoostingClassifier(**gbm_settings)
gbm.fit(X_train, y_train)

y_pred = gbm.predict(X_test)
test_accuracy = gbm.score(X_test, y_test)
print('Validation accuracy:', test_accuracy)

confusion_matrix(y_test, y_pred)



In [None]:
from yellowbrick.classifier import ClassPredictionError

# Use a large canvas for this and later figures.
plt.rcParams['figure.figsize'] = (20,10)

# Wrap the gradient-boosting model in the class-prediction-error visualizer,
# using the three integer class labels.
class_labels = [0, 1, 2]
visualizer = ClassPredictionError(gbm, classes=class_labels)

# Fit on the training split, score on the held-out split, then save the figure.
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.poof(outpath='./Desktop/Class_prediction.png')

In [None]:
# ROC / AUC curves for the gradient-boosting model: fit on the training split,
# score on the held-out split, then write the figure to disk.
roc_png = './Desktop/ROC.png'

visualizer = ROCAUC(gbm)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.poof(outpath=roc_png)


In [None]:
# Confusion-matrix figure for the gradient-boosting model.
cm = ConfusionMatrix(gbm)

# Fit the visualizer before scoring, as the other visualizers in this notebook do;
# the original called score() directly on an unfitted visualizer.
cm.fit(X_train, y_train)
cm.score(X_test, y_test)

# Enlarge the text drawn on the axes so it stays legible on the large canvas.
for label in cm.ax.texts:
    label.set_size(16)
cm.poof('./Desktop/Confus.png')

In [None]:
from yellowbrick.features.importances import FeatureImportances

# Pair every training column with the fitted model's importance score,
# indexed by feature name.
feature_importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': gbm.feature_importances_,
}).set_index('feature')

# Horizontal bar chart of the five largest importances.
top_five = feature_importances['importance'].sort_values(ascending=False).head(5)
top_five.plot(kind='barh')

In [None]:
# Diagnostics: the ten (column, importance) pairs with the highest importance.
column_names = [*X.columns]
feature_importance = [*gbm.feature_importances_]

zipped = [(name, score) for name, score in zip(column_names, feature_importance)]

top_ten = sorted(zipped, key=lambda pair: pair[1], reverse=True)[:10]
top_ten

In [None]:
# Translate the integer target back into readable labels for the breakdowns below.
# Assign the result instead of using inplace=True on an attribute-accessed column:
# the in-place form may be a silent no-op or may also rewrite aliases of the column.
status_names = {1: 'non-functional', 0: 'needs_repairs', 2: 'functional'}
df['status_group'] = df['status_group'].replace(status_names)

# Intermediate results are printed; as bare expressions they were computed and discarded.
print(df.population.describe())

print('Status of pumps recorded more than 1400 days before the reference date:')
print(df.status_group[df.days_since_recorded > 1400].value_counts())

# Pumps more than 2 standard deviations ABOVE the mean population.
population_cutoff = df.population.mean() + 2 * df.population.std()
print('Status of pumps with unusually high population:')
print(df.status_group[df.population > population_cutoff].value_counts())

# Pumps more than 2 standard deviations ABOVE the mean amount_tsh. The original
# threshold used population.std() here, mixing the spread of a different column.
amount_cutoff = df.amount_tsh.mean() + 2 * df.amount_tsh.std()
print('Status of pumps with unusually high amount_tsh:')
print(df.status_group[df.amount_tsh > amount_cutoff].value_counts())

# Status breakdown for pumps flagged as dry (shown as the cell output).
df.status_group[df.quantity_dry == 1].value_counts()

In [None]:
# Single-point "grid": GridSearchCV is used here only to cross-validate and fit one
# fixed hyper-parameter combination.
best_params = {'learning_rate': [0.07],
               'max_depth': [14],
               'min_samples_leaf': [16],
               'max_features': [1.0],
               'n_estimators': [100]}

estimator = GridSearchCV(estimator=GradientBoostingClassifier(),
                         param_grid=best_params,
                         n_jobs=-1)

# Fit on the training split. The original passed the full X together with y_train,
# whose row counts differ, so fit() raised ValueError. Fitting on all of X would also
# leak the held-out rows into the model scored below.
estimator.fit(X_train, y_train)

test_accuracy = estimator.score(X_test, y_test)

print('validation accuracy:', test_accuracy)