In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files the competition input directory provides.
input_dir_contents = os.listdir("../input")
print(input_dir_contents)

# Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from sklearn.model_selection import train_test_split
%matplotlib inline



In [None]:
# Read the training and test tables from the input directory.
train_df = pd.read_csv(filepath_or_buffer="../input/train.csv")
test_df = pd.read_csv(filepath_or_buffer="../input/test.csv")



In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned).
    verbose : bool, default True
        When True, print the memory usage after optimization and the
        percentage saved. When False, nothing is printed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    Float columns may be stored as float16, which keeps only about three
    significant decimal digits, so the conversion of floats is lossy.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate target dtypes, ordered from narrowest to widest.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's min/max still fits.
                # (Strict bounds would widen e.g. an int16 column holding 32767.)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when neither narrower float type covers the
            # range; this is also the path taken when min/max are NaN, because
            # every comparison with NaN is False.
            target_type = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target_type = candidate
                    break
            df[col] = df[col].astype(target_type)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Downcast both frames (train first, then test) to cut their memory footprint.
train_df, test_df = reduce_mem_usage(train_df), reduce_mem_usage(test_df)

In [None]:
# Summary statistics of the training frame.
train_df.describe()

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Separate the label column from the features.
target = train_df["target"]
train_df = train_df.drop(columns=["target"])

In [None]:
# Histogram (with a KDE overlay) of the target values.
plt.figure(figsize=(8,6))
sns.distplot(target,kde=True,color='m')
plt.title("Target Distribution",fontsize=16)
# The x-axis carries the target values and the y-axis how often they occur;
# the two labels were previously swapped.
plt.xlabel("Target",fontsize=12)
plt.ylabel("Target Freq",fontsize=12)
# Render explicitly so the cell does not echo the repr of the last call.
plt.show()



The distribution of the target is bimodal; it is not normal.

In [None]:
# Correlation heatmap of the features.
# Observation from the original run: no pair of features looks strongly correlated.
plt.figure(figsize=(15,15))
# `train_df[2:]` slices away the first two *rows*, not columns, so it kept every
# column (including the identifier) and silently dropped two samples.
# Presumably the intent was to skip the leading non-feature columns, so the
# `id` column is dropped by name here (ignored if it is absent) — confirm.
feature_corr = train_df.drop(columns=["id"], errors="ignore").corr()
sns.heatmap(feature_corr,cmap='viridis')
plt.show()

In [None]:
def plot_feature_importance(model, df, top_n=40):
    """Draw a vertical bar chart of the model's largest feature importances.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a ``feature_importances_`` array.
    df : pandas.DataFrame
        Frame whose columns name the features; ``df.columns`` must line up
        element-for-element with ``model.feature_importances_``.
    top_n : int, default 40
        Number of highest-importance features to show.

    Returns
    -------
    None
        The chart is drawn on a new matplotlib figure.
    """
    df_res = pd.DataFrame({"Features": df.columns,
                           "Importance": model.feature_importances_})
    # Sort into a new frame rather than in place, then keep the top entries.
    # "Features" stays a regular column because barplot looks it up by name
    # (the original called set_index here but discarded the result).
    df_res = df_res.sort_values(by="Importance", ascending=False).iloc[:top_n]
    plt.figure(figsize=(15, 12))
    sns.barplot(x="Features", y="Importance", data=df_res, orient="v")
    plt.ylabel("Importance")
    plt.xlabel("Features")
  

In [None]:
# Hyper-parameter values explored by the grid search (2 x 2 x 2 = 8 combinations).
param_grid = dict(
    n_estimators=[1500, 2000],
    max_features=[200, 150],
    max_depth=[3, 5],
)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# 20 stratified random splits; each one fits on 20% of the rows and scores on
# the remaining 80%.
# A fixed random_state makes the splits, and therefore the cross-validation
# scores, reproducible across kernel restarts.
shuffle_split = StratifiedShuffleSplit(n_splits=20,
                                       train_size=0.2,
                                       test_size=0.8,
                                       random_state=42)

In [None]:
# Exhaustive search over param_grid, evaluated on the stratified shuffle splits.
# Candidates are ranked by ROC AUC rather than the default accuracy: the
# notebook submits predicted probabilities and evaluates LightGBM with AUC, so
# model selection should use the same ranking-based metric.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=12, n_jobs=6),
    param_grid=param_grid,
    cv=shuffle_split,
    scoring="roc_auc",
)

In [None]:
# Run the grid search on the training features and labels.
# NOTE(review): train_df presumably still contains the `id` column here (test_df,
# which is later passed to predict_proba, has one), so the row identifier is
# used as a model feature — confirm this is intended.
grid_search.fit(train_df,target)

In [None]:
# Report the winning parameter combination, its mean CV score and the refitted model.
best_params = grid_search.best_params_
best_cv_score = grid_search.best_score_
best_model = grid_search.best_estimator_
print(f"Best parameters : {best_params}")
print(f"Best cross validation score: {best_cv_score:.2f}")
print(f"Best estimator: {best_model}")


In [None]:
# Tabulate the per-candidate cross-validation results and preview the first rows.
results = pd.DataFrame(data=grid_search.cv_results_)
results.head()

In [None]:
# Mean test score of every parameter combination as a flat NumPy array.
scores_rf = np.array(results["mean_test_score"]).ravel()

In [None]:
# Display the mean test score of each parameter combination.
scores_rf


In [None]:
# Bar chart of the refitted best estimator's largest feature importances.
plot_feature_importance(grid_search.best_estimator_,train_df)

In [None]:
# Quick pandas view of the same importances: the 20 largest, as a bar chart.
rf_importances = pd.Series(
    grid_search.best_estimator_.feature_importances_,
    index=train_df.columns,
)
rf_importances.nlargest(20).plot(kind='bar')



In [None]:
#type(grid_search.best_estimator_.feature_importances_)

In [None]:
# Class probabilities for every test row; keep the second column, i.e. the
# probability of the second entry in the estimator's `classes_`.
test_proba_rf = grid_search.predict_proba(test_df)
pred_rf = test_proba_rf[:, 1]


In [None]:
# Preview the first five predictions. The original slice `[:-5]` dumped
# everything except the last five values.
pred_rf[:5]

In [None]:
# Build the submission table (test ids plus predicted probabilities) and write it to CSV.
sub_df = pd.DataFrame({"id": test_df["id"].values, "target": pred_rf})
sub_df.to_csv("baseline_rf.csv", index=False)

## Let's try LightGBM

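LightGBM's training API needs the data wrapped in its own `Dataset` format, so we first hold out a validation set and then wrap both parts.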

In [None]:
# Hold out 20% of the rows for validation, stratified on the target so both
# parts keep the same class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    train_df,
    target,
    test_size=0.20,
    stratify=target,
    random_state=42,
)

In [None]:
# A cell renders only its last expression, so the feature preview was silently
# discarded. display() shows both the features and the labels.
display(X_train.head())
display(y_train.head())

In [None]:
# Wrap both splits as LightGBM datasets; the validation set is tied to the
# training set through `reference`.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_val, label=y_val, reference=lgb_train)

In [None]:
params = {
    # With no explicit objective LightGBM trains its default regression
    # objective, whose raw outputs are not probabilities. The target is used as
    # a binary class label elsewhere in this notebook (classifier,
    # predict_proba[:, 1], AUC metric), so train a binary classifier.
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'num_leaves': 5,            # very small trees
    'learning_rate': 0.01,
    'feature_fraction': 0.8,    # sample 80% of the features per tree
    'bagging_fraction': 0.8,    # sample 80% of the rows ...
    'bagging_freq': 5,          # ... and redraw that sample every 5 iterations
    'verbose': 1
}
print('Starting training...')

# Train for at most 2000 rounds, stopping early once the validation AUC has not
# improved for 100 consecutive rounds; evaluation is logged every 3 rounds.
# NOTE(review): the `early_stopping_rounds` and `verbose_eval` keyword arguments
# were removed from `lgb.train` in LightGBM 4.0 in favour of callbacks — confirm
# the installed version before upgrading.
gbm = lgb.train(params=params,
                train_set=lgb_train,
                num_boost_round=2000,
                valid_sets=lgb_eval,
                early_stopping_rounds=100,
                verbose_eval=3)


In [None]:
# Report the early-stopping result. The second message used to be labelled
# "Best iteration" although it prints the best validation score.
print("Best iteration is {}".format(gbm.best_iteration))
print("Best score is {}".format(gbm.best_score))

In [None]:
# Predict on the test rows using the trees up to the best iteration recorded during training.
pred_gbm = gbm.predict(test_df, num_iteration=gbm.best_iteration)

In [None]:
# Preview the first five LightGBM predictions.
pred_gbm[:5]

In [None]:
# Build the submission table (test ids plus LightGBM predictions) and write it to CSV.
sub_df = pd.DataFrame({"id": test_df["id"].values, "target": pred_gbm})
sub_df.to_csv("baseline_gbm.csv", index=False)

In [None]:
# LightGBM's built-in importance chart, limited to at most 100 features.
fig, ax = plt.subplots(figsize=(12, 10))
lgb.plot_importance(gbm, ax=ax, height=0.8, max_num_features=100)
ax.grid(False)
ax.set_title("LightGBM - Feature Importance", fontsize=15)
plt.show()