## "Pump It Up " DRIVEN DATA COMPETITION
Data from Taarifa and the Tanzanian Ministry of Water are used to predict which pumps are functional,
which need some repairs, and which don't work at all? 
https://www.drivendata.org/competitions/7/pump-it-up-data-mining-the-water-table/


# Import the data

In [1]:
import pandas as pd

In [2]:
dtypes = {'region_code': 'object', 'district_code':'object'}

X_train = pd.read_csv('train_values.csv', 
                      index_col = [0],dtype = dtypes)
X_train.head(3)

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe


In [3]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 69572 to 26348
Data columns (total 39 columns):
amount_tsh               59400 non-null float64
date_recorded            59400 non-null object
funder                   55765 non-null object
gps_height               59400 non-null int64
installer                55745 non-null object
longitude                59400 non-null float64
latitude                 59400 non-null float64
wpt_name                 59400 non-null object
num_private              59400 non-null int64
basin                    59400 non-null object
subvillage               59029 non-null object
region                   59400 non-null object
region_code              59400 non-null object
district_code            59400 non-null object
lga                      59400 non-null object
ward                     59400 non-null object
population               59400 non-null int64
public_meeting           56066 non-null object
recorded_by              59400 non-null o

In [4]:
y_train = pd.read_csv('train_labels.csv', index_col = [0])
y_train = y_train.status_group
y_train.head(3)

id
69572    functional
8776     functional
34310    functional
Name: status_group, dtype: object

In [5]:
dtypes = {'region_code': 'object', 'district_code':'object'}
X_test = pd.read_csv('test_vlaues.csv', 
                      index_col = [0], dtype = dtypes,
                    parse_dates=['date_recorded'],infer_datetime_format=True)
X_test.head(3)

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,Internal,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other
51630,0.0,2013-02-04,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,Pangani,...,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
17168,0.0,2013-02-01,,1567,,34.767863,-5.004344,Puma Secondary,0,Internal,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,other,other


In [6]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14850 entries, 50785 to 68707
Data columns (total 39 columns):
amount_tsh               14850 non-null float64
date_recorded            14850 non-null datetime64[ns]
funder                   13981 non-null object
gps_height               14850 non-null int64
installer                13973 non-null object
longitude                14850 non-null float64
latitude                 14850 non-null float64
wpt_name                 14850 non-null object
num_private              14850 non-null int64
basin                    14850 non-null object
subvillage               14751 non-null object
region                   14850 non-null object
region_code              14850 non-null object
district_code            14850 non-null object
lga                      14850 non-null object
ward                     14850 non-null object
population               14850 non-null int64
public_meeting           14029 non-null object
recorded_by              14850 no

# Create Submssion Pipeline

In [35]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import TruncatedSVD

In [8]:
columns = ['gps_height']
ct = ColumnTransformer(remainder='drop', 
                       transformers=[
                           ('select', 'passthrough', columns )])

model_1 = Pipeline([
    ('selector', ct),
    #('predictor', SVC())
    ('predictor', DecisionTreeClassifier())
    
])

In [9]:
model_1.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('selector',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('select', 'passthrough',
                                                  ['gps_height'])],
                                   verbose=False)),
                ('predictor',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                                        max_depth=None, max_features=None,
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        pres

In [10]:
model_1.score(X_train,y_train)

0.5714814814814815

In [11]:
y_test_pred = model_1.predict(X_test)

In [12]:
X_test.index[:10]

Int64Index([50785, 51630, 17168, 45559, 49871, 52449, 24806, 28965, 36301,
            54122],
           dtype='int64', name='id')

In [13]:
predictions = pd.Series(data= y_test_pred,
                        index= X_test.index,
                        name= 'status_group')

In [14]:
predictions.head()

id
50785        functional
51630        functional
17168        functional
45559    non functional
49871        functional
Name: status_group, dtype: object

In [15]:
predictions.to_csv('predictions/1st_pred.csv')

  """Entry point for launching an IPython kernel.


In [16]:
def make_submission(model, X_test):
    y_test_pred = model.predict(X_test)
    predictions = pd.Series(data= y_test_pred,
                        index= X_test.index,
                        name= 'status_group')
    date = pd.Timestamp.now().strftime(format ='%Y-%m-%d_%H-%M_')
    predictions.to_csv(f'predictions/{date}submission.csv', index=True, header=True)
    

In [17]:
make_submission(model_1, X_test)

# Model with Numerical Features

In [18]:
X_train.dtypes

amount_tsh               float64
date_recorded             object
funder                    object
gps_height                 int64
installer                 object
longitude                float64
latitude                 float64
wpt_name                  object
num_private                int64
basin                     object
subvillage                object
region                    object
region_code               object
district_code             object
lga                       object
ward                      object
population                 int64
public_meeting            object
recorded_by               object
scheme_management         object
scheme_name               object
permit                    object
construction_year          int64
extraction_type           object
extraction_type_group     object
extraction_type_class     object
management                object
management_group          object
payment                   object
payment_type              object
water_qual

In [19]:
num_feat = X_train.select_dtypes(include = 'number').columns.to_list()
num_feat

['amount_tsh',
 'gps_height',
 'longitude',
 'latitude',
 'num_private',
 'population',
 'construction_year']

In [20]:
X_train.select_dtypes(include = 'number').describe()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,num_private,population,construction_year
count,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0
mean,317.650385,668.297239,34.077427,-5.706033,0.474141,179.909983,1300.652475
std,2997.574558,693.11635,6.567432,2.946019,12.23623,471.482176,951.620547
min,0.0,-90.0,0.0,-11.64944,0.0,0.0,0.0
25%,0.0,0.0,33.090347,-8.540621,0.0,0.0,0.0
50%,0.0,369.0,34.908743,-5.021597,0.0,25.0,1986.0
75%,20.0,1319.25,37.178387,-3.326156,0.0,215.0,2004.0
max,350000.0,2770.0,40.345193,-2e-08,1776.0,30500.0,2013.0


In [21]:
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

In [22]:
num_pipe = Pipeline([
    ('imputer', SimpleImputer(missing_values =0, strategy ='mean'))
])
ct = ColumnTransformer(remainder = 'drop',
                      transformers=[('numerical', num_pipe, num_feat)])
##Ct pass the columns to numerical pipeline

model_2 = Pipeline([
    ('ct', ct),
    ('classifier', DecisionTreeClassifier())
])

In [23]:
model_2.fit(X_train, y_train);

In [24]:
model_2.score(X_train, y_train)

0.9841414141414141

In [25]:
make_submission(model_2,X_test)

This model got a score of '0.675' 

# Model  with Numerical and Categorical Features

In [26]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder



In [27]:
cat_feat = X_train.select_dtypes(include='object').columns.to_list()

In [28]:
num_pipe = Pipeline([
    ('imputer', SimpleImputer(missing_values =0, strategy ='mean')),
    ('scaler', StandardScaler())
])

cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy ='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown= 'ignore'))
    
])

ct = ColumnTransformer(remainder = 'drop',
                      transformers=[
                          ('numerical', num_pipe, num_feat),
                          ('categorical', cat_pipe, cat_feat)
                      ])
##Ct pass the columns to numerical pipeline

model_3 = Pipeline([
    ('ct', ct),
    ('classifier', DecisionTreeClassifier())
    
])

In [29]:
model_3.fit(X_train, y_train)
print(model_3.score(X_train, y_train))

0.999983164983165


In [30]:
make_submission(model_3, X_test)

score from this model_3 is '0.7875'

As Decision tree improved our model we can try for Random Forest model

In [31]:
num_pipe = Pipeline([
    ('imputer', SimpleImputer(missing_values =0, strategy ='mean')),
    ('scaler', StandardScaler())
])

cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy ='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown= 'ignore'))
    
])

ct = ColumnTransformer(remainder = 'drop',
                      transformers=[
                          ('numerical', num_pipe, num_feat),
                          ('categorical', cat_pipe, cat_feat)
                      ])
##Ct pass the columns to numerical pipeline



model_4 = Pipeline([
    ('ct', ct),
    ('classifier', RandomForestClassifier(n_jobs = -1))
    
])

In [33]:
model_4.fit(X_train, y_train)
print(model_4.score(X_train, y_train))

0.9863299663299663


In [27]:
make_submission(model_4, X_test)

score from this model_4 is '0.803'

# Attempt3: Decreasing the number of Features

In [37]:
num_pipe = Pipeline([
    ('imputer', SimpleImputer(missing_values =0, strategy ='mean')),
    ('scaler', StandardScaler())
])

cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy ='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown= 'ignore'))
    
])

ct = ColumnTransformer(remainder = 'drop',
                      transformers=[
                          ('numerical', num_pipe, num_feat),
                          ('categorical', cat_pipe, cat_feat)
                      ])
##Ct pass the columns to numerical pipeline



model_5 = Pipeline([
    ('ct', ct),
    ('pca', TruncatedSVD(n_components=30)),
    ('classifier', RandomForestClassifier(n_jobs = -1))
    
])

In [38]:
model_5.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('ct',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numerical',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=0,
                                                                                 strategy='mean',
                                                                     

In [39]:
print(model_5.score(X_train, y_train))

0.9828787878787879


In [41]:
make_submission(model_5, X_test)