In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Read Training Set Data
Create a copy of training set feature data (named as `train_X`), and a copy of training set label data (named as `train_y`).

In [2]:
X = pd.read_csv('training_set_values.csv')
y = pd.read_csv('training_set_labels.csv')
train_X = X.copy()
train_y = y.copy().drop(['id'], axis = 1)

# Data Preparation Pipeline
The full pipeline contains the following components (in order):<br>
- Select features that will be included in the model based on the results from exploratory data analysis
- Clean `construction_year` by replace value '0' to missing data
- Engineer `installer` by keeping top 10 most frequent values and replacing the rest to 'other'
- Stardardize numerical features
- Impute the most frequent value for missing data for categorical features
- Apply One-hot coding to categorical features

In [3]:
# Function to Select Features
def select_features(data):
    list_to_drop = ['id', 'date_recorded', 'funder', 'wpt_name', 'num_private', 'subvillage', 'region', 'region_code',
                    'district_code', 'lga', 'ward', 'scheme_management', 'recorded_by', 'scheme_name', 'extraction_type', 'extraction_type_group', 
                    'payment', 'water_quality', 'quantity_group', 'source', 'waterpoint_type']
    data = data.drop(list_to_drop, axis = 1)
    return data

In [4]:
# Function to Clean construction_year Feature
def clean_year(data):
    median_construction_year = data['construction_year'].median()
    data = data.replace({'construction_year': {0: median_construction_year}}) 
    return data

In [5]:
# Function to Engineer installer
def engineer_installer(data):
    data = data.replace({'installer': {'0': 'NaN'}}) 
    list_to_df = data['installer'].value_counts().reset_index()
    df_to_list = list_to_df['index'].to_list()
    df_to_list.remove('NaN')
    for i in range(0, len(data['installer'])):
        if data.iloc[i]['installer'] not in df_to_list[:10]:
            data.loc[i:i, 'installer'] = 'Other'
    return data

In [6]:
# Function to Prepare train_X (i.e., full pipeline)
def prepare_data(data):
    data = engineer_installer(clean_year(select_features(data)))
    data_num = data.select_dtypes(exclude = 'object')
    num_pipeline = Pipeline([('std_scaler', StandardScaler()),])
    data_cat = data.select_dtypes(include = 'object')
    cat_pipeline = Pipeline([('imputer', SimpleImputer(missing_values = np.nan, strategy = "most_frequent")),
                             ('ohe', OneHotEncoder()),])
    num_attribs = list(data_num) 
    cat_attribs = list(data_cat)
    full_pipeline = ColumnTransformer([("num", num_pipeline, num_attribs), 
                                       ("cat", cat_pipeline, cat_attribs),]) 
    data_prepared = full_pipeline.fit_transform(data)
    return data_prepared

Prepare `train_X` by using full pipeline, and prepare `train_y` by applying `LabelEncoder`.

In [7]:
train_X = prepare_data(train_X)
le = LabelEncoder()
train_y = le.fit_transform(train_y)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  y = column_or_1d(y, warn=True)


# Model Training
Train a few dirty and quick model with standard parameter, and based on its results to decide which model should be fine tuned and used. Six classifiers are trained and cross validated using function `cross_val`. Each model's average accurary score and its standard deviation are printed using function `display_scores`.

In [8]:
def cross_val(classifier, X, y):
    clf = classifier
    clf.fit(X, y)
    clf_scores = cross_val_score(clf, X, y, cv = 10, n_jobs = -1)
    avg_accuracy = clf_scores.mean()
    std_accuracy = clf_scores.std()
    return avg_accuracy, std_accuracy

def display_scores(classifier_name, avg_accuracy, std_accuracy):
    print(classifier_name)
    print('Mean: ', avg_accuracy)
    print('Standard Deviation: ', std_accuracy)

In [9]:
avg_accuracy, std_accuracy = cross_val(LinearSVC(random_state = 42, tol = 1e-5, dual = False), train_X, train_y)
display_scores('Linear Support Vector Classification', avg_accuracy, std_accuracy)

avg_accuracy, std_accuracy = cross_val(linear_model.SGDClassifier(max_iter=1000, tol=1e-3, n_jobs = -1, random_state = 42), train_X, train_y)
display_scores('Linear classifiers with SGD training', avg_accuracy, std_accuracy)

avg_accuracy, std_accuracy = cross_val(GaussianNB(), train_X.toarray(), train_y)
display_scores('Gaussian Naive Bayes', avg_accuracy, std_accuracy)

avg_accuracy, std_accuracy = cross_val(BernoulliNB(), train_X.toarray(), train_y)
display_scores('Naive Bayes classifier for multivariate Bernoulli models', avg_accuracy, std_accuracy)

avg_accuracy, std_accuracy = cross_val(DecisionTreeClassifier(random_state = 42), train_X, train_y)
display_scores('Decision Tree Classifier', avg_accuracy, std_accuracy)

avg_accuracy, std_accuracy = cross_val(RandomForestClassifier(n_estimators = 100, max_depth = 2, random_state = 42), train_X, train_y)
display_scores('Random Forest Classifier', avg_accuracy, std_accuracy)

Linear Support Vector Classification
Mean:  0.7303032296036772
Standard Deviation:  0.005327665724590623
Linear classifiers with SGD training
Mean:  0.724748167060541
Standard Deviation:  0.006365990081275285




Gaussian Naive Bayes
Mean:  0.3025254793896805
Standard Deviation:  0.014991231967817192
Naive Bayes classifier for multivariate Bernoulli models
Mean:  0.6589565905558069
Standard Deviation:  0.005561560049406029
Decision Tree Classifier
Mean:  0.7545449289397614
Standard Deviation:  0.005620641720731255
Random Forest Classifier
Mean:  0.6899162214570166
Standard Deviation:  0.00519800956212946


From the results above, Decision Tree Classifier should be further fine tuned. Random Forest Classifier usually performs better than Decisision Tree Classifier (although it is not the case here), thus Random Forest Classifier will be fine tuned as well.<br>

Using `grid_search` to fine tune the classifiers mentioned above. For Decision Tree Classifier, `min_sample_split` is adjusted to regulate the classifier, and `max_features` is adjusted to check whether the number of features used in the classifier is appropriate.

In [10]:
param_grid_tree = [{'min_samples_split': [6, 8, 10], 'max_features': ['auto', 'log2', 'sqrt']},] 
tree = DecisionTreeClassifier(random_state = 42) 
grid_search_tree = GridSearchCV(tree, param_grid_tree, cv = 5, 
                                scoring = 'accuracy', return_train_score = True) 
grid_search_tree.fit(train_X, train_y)
print('Best Parameter: ', grid_search_tree.best_params_)
print('Best Estimator:', grid_search_tree.best_estimator_)
print('Best Score: ', grid_search_tree.best_score_)

Best Parameter:  {'max_features': 'log2', 'min_samples_split': 10}
Best Estimator: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
Best Score:  0.7601515151515151


For Random Forest Classifier, `n_estimators` is adjusted to determine what number of the trees in the model renders the best accuracy score. 

In [11]:
param_grid_forest = [{'n_estimators': [10, 100, 300]}, {'max_features': ['auto']},] 
forest = RandomForestClassifier(random_state = 42) 
grid_search_forest = GridSearchCV(forest, param_grid_forest, cv = 5, 
                                  scoring = 'accuracy', return_train_score = True) 
grid_search_forest.fit(train_X, train_y)
print('Best Parameter: ', grid_search_forest.best_params_)
print('Best Estimator:', grid_search_forest.best_estimator_)
print('Best Score: ', grid_search_forest.best_score_)



Best Parameter:  {'n_estimators': 300}
Best Estimator: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
Best Score:  0.7994781144781145


Based on the results, Random Forest classifier out performs Decision Tree Classifier. Save the fine tuned Random Forest Classifier.

In [12]:
SaveClassifier = open("RandomForest.pickle","wb")
pickle.dump(grid_search_forest.best_estimator_, SaveClassifier)
SaveClassifier.close()

# Prediction
Use the fine tuned Random Forest Classifier to predict the label for test set (i.e., the functionality of the water well based on information provided).

In [13]:
final_model = grid_search_forest.best_estimator_
test_X = pd.read_csv('testing_set_values.csv')
test_X = prepare_data(test_X)
final_predictions = final_model.predict(test_X) 

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Save the prediction to the `SubmissionFilled.csv` file.

In [15]:
final_predictions = le.inverse_transform(final_predictions)
test_y = pd.read_csv('Submission.csv')
test_y['status_group'] = final_predictions
test_y.to_csv('SubmissionFilled.csv' ,index = False)

Scored accuracy rate 79.57% on test set according to the competition site.