In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
train = pd.read_csv('../Data/train.csv')
train.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,0,5,8,5,8,6,4,4,3,3,...,5,3,3,5,4,7,5,7,3,0.445
1,1,6,7,4,4,8,8,3,5,4,...,7,2,0,3,5,3,3,4,3,0.45
2,2,6,5,6,7,3,7,1,5,4,...,7,3,7,5,6,8,2,3,3,0.53
3,3,3,4,6,5,4,8,4,7,6,...,2,4,7,4,4,6,5,7,5,0.535
4,4,5,3,2,6,4,4,3,3,3,...,2,2,6,6,4,1,2,3,5,0.415


In [3]:
train.isnull().sum()

id                                 0
MonsoonIntensity                   0
TopographyDrainage                 0
RiverManagement                    0
Deforestation                      0
Urbanization                       0
ClimateChange                      0
DamsQuality                        0
Siltation                          0
AgriculturalPractices              0
Encroachments                      0
IneffectiveDisasterPreparedness    0
DrainageSystems                    0
CoastalVulnerability               0
Landslides                         0
Watersheds                         0
DeterioratingInfrastructure        0
PopulationScore                    0
WetlandLoss                        0
InadequatePlanning                 0
PoliticalFactors                   0
FloodProbability                   0
dtype: int64

Bench Mark Model

In [4]:
X = train.drop(columns=['id','FloodProbability'])
y = train['FloodProbability']

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

In [6]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_train, y_train)

0.8450306764524692

In [7]:
lr.score(X_val, y_val)

0.8447057763250218

StandardScalar model

In [8]:
sc = StandardScaler()
X_trainsc = sc.fit_transform(X_train)
X_valsc = sc.transform(X_val)


In [9]:
lr2 = LinearRegression()
lr2.fit(X_trainsc, y_train)
lr2.score(X_trainsc, y_train)

0.8450306764524692

In [10]:
lr2.score(X_valsc, y_val)

0.8447057763250218

Poly and standard scalar

In [11]:
poly = PolynomialFeatures(degree=2, include_bias=False)

In [12]:
X_poly = poly.fit_transform(X)
X_poly = pd.DataFrame(X_poly)

scpoly = StandardScaler()
X_polysc = scpoly.fit_transform(X_poly)

In [13]:
X1_train, X1_val, y1_train, y1_val = train_test_split(X_polysc, y, random_state=42)

In [14]:
lr3 = LinearRegression()
lr3.fit(X1_train, y1_train)
lr3.score(X1_train, y1_train)

0.8456372703227519

In [15]:
lr3.score(X1_val, y1_val)

0.8452910562909939

In [16]:
from sklearn.linear_model import RidgeCV

In [17]:
%%time
r_alphas = np.logspace(-2, 5, 100)
ridge_cv = RidgeCV(alphas=r_alphas, cv=10)
ridge_cv.fit(X1_train, y1_train)

CPU times: user 2h 42min 13s, sys: 17min 18s, total: 2h 59min 31s
Wall time: 29min 9s


In [18]:
ridge_cv.best_score_

0.8455431918042

In [19]:
ridge_cv.score(X1_train, y1_train)

0.8456372597571806

In [20]:
ridge_cv.score(X1_val, y1_val)

0.8452912364916261

In [21]:
X = train.drop(columns=['id','FloodProbability'])
y = train['FloodProbability']

In [22]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

In [23]:
%%time
r_alphas = np.logspace(-2, 5, 50)
ridge_cv = RidgeCV(alphas=r_alphas, cv=10)
ridge_cv.fit(X_train, y_train)

CPU times: user 3min 57s, sys: 2min 33s, total: 6min 30s
Wall time: 52.3 s


In [24]:
ridge_cv.score(X_train, y_train)

0.8450306764423855

In [25]:
ridge_cv.score(X_val, y_val)

0.844705777074947

Random Forest

In [26]:
rf = RandomForestRegressor()
rf.fit(X_trainsc, y_train)
rf.score(X_trainsc, y_train), rf.score(X_valsc, y_val)

(0.9510270342731438, 0.6541642438491279)

Lasso

In [27]:
las = Lasso(max_iter=1000,alpha=0.1)
las.fit(X_train, y_train)
las.score(X_train, y_train), las.score(X_val, y_val)

(0.0, -6.036083950444748e-07)

In [28]:
las1 = Lasso(max_iter=1000, alpha=0.1)
las1.fit(X_trainsc, y_train)
las1.score(X_trainsc, y_train), las1.score(X_valsc, y_val)

(0.0, -6.036083950444748e-07)

In [29]:
las = Lasso(max_iter=1000,alpha=0.1)
las.fit(X1_train, y_train)
las.score(X1_train, y_train), las.score(X1_val, y_val)

(0.0, -6.036083950444748e-07)

Random Forest with GradBoost and Random Search

In [33]:
params = {
    'max_features': np.arange(5, X.shape[1] + 1),
    'max_depth': np.append(np.arange(1, 50), None),
    'min_samples_leaf': [2, 3],   
    'n_estimators': [50, 100, 150]
}

rf = GradientBoostingRegressor(random_state = 42)

rf = RandomizedSearchCV(rf, params, n_iter=10, cv = 5, n_jobs = -1)

In [34]:
%%time
rf.fit(X_train, y_train)



CPU times: user 2min 54s, sys: 1.01 s, total: 2min 55s
Wall time: 1h 10min 39s


In [35]:
rf.score(X_train, y_train), rf.score(X_val, y_val)

(0.9939100754819915, 0.7208527211491504)

In [36]:
rf.best_params_

{'n_estimators': 100,
 'min_samples_leaf': 2,
 'max_features': 5,
 'max_depth': 19}

In [47]:
params2 = {
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'max_features': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
    'min_samples_leaf': [2, 3],
    'n_estimators': [50, 100, 150]
}

rf2 = GradientBoostingRegressor(random_state = 42)

rf2 = RandomizedSearchCV(rf2, params2, n_iter=10, cv = 5, n_jobs = -1)

In [48]:
%%time
rf2.fit(X_trainsc, y_train)

CPU times: user 3min 58s, sys: 651 ms, total: 3min 58s
Wall time: 21min 9s


In [49]:
rf2.score(X_trainsc, y_train), rf2.score(X_valsc, y_val)

(0.8602898832793927, 0.8196195663775501)

In [51]:
rf2.best_params_

{'n_estimators': 150,
 'min_samples_leaf': 2,
 'max_features': 9,
 'max_depth': 10}

In [58]:
params3 = {
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'max_features': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
    'min_samples_leaf': [2, 3],
    'n_estimators': [50, 100, 150]
}

rf3 = GradientBoostingRegressor(random_state = 42)

rf3 = RandomizedSearchCV(rf3, params3, n_iter=10, cv = 5, n_jobs = -1)

In [59]:
%time
rf3.fit(X1_train, y_train)

CPU times: user 3 µs, sys: 2 µs, total: 5 µs
Wall time: 111 µs


python(6570) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6571) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6572) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6573) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6574) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6575) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6576) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6577) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


KeyboardInterrupt: 

In [None]:
rf3.score(X1_train, y_train), rf3.score(X1_val, y_val)