### Dataset files downloaded from Kaggle

In [None]:
# Import module feature_selector- it will be used as part of Exploratory Data Analysis
!cp -r ../input/feature-selector/feature-selector/feature-selector-master/feature_selector/* ./
import feature_selector

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from feature_selector import FeatureSelector

### Load the training dataset and convert missing-value markers to `NaN`

In [None]:
# Missing values are designated as -1 in this dataset; read them as NaN right away.
train_df = pd.read_csv('../input/porto-seguro-safe-driver-prediction/train.csv', low_memory=False, na_values= '-1')
# Lift the pandas display limits so cell output shows every row and column.
pd.options.display.max_rows=None
pd.options.display.max_columns = None
train_df.head()

### Check if dataset is balanced

In [None]:
train_df.target.value_counts()

### From the last cell, the target class counts show an imbalanced dataset: class `0` far exceeds class `1`. Let's visualize it.

In [None]:
train_df['target'].value_counts().plot(kind='bar', figsize=(5,5));


### The above plot suggests that further data exploration is needed, given that the dataset is not balanced. Let's check feature correlation.

In [None]:
# Figure size is given in inches (width, height).
fig, ax = plt.subplots(figsize=(20,10))         
# Pairwise correlation of every column of the (imbalanced) training frame.
corr = train_df.corr()
# NOTE(review): annot_kws is passed without annot=True, so it presumably has
# no visible effect on this plot — confirm whether annotations were intended.
sns.heatmap(corr, cmap='RdYlBu', annot_kws={'size':30}, ax=ax)
ax.set_title("Feature Correlation Matrix", fontsize=14)
plt.show()

In [None]:
# Make a copy of data
train_df_copy = train_df.copy()

### Remove features with little or no correlation with either the target or the other features. Looking at the above correlation matrix, features whose label contains `_calc_` have almost no correlation with the target, so let's drop them.

In [None]:
def remove_calc(data_df):
  """Drop every column whose label contains ``'_calc'`` from ``data_df``.

  The frame is modified in place and the same object is returned, so both
  ``remove_calc(df)`` and ``df = remove_calc(df)`` work.

  Args:
    data_df: DataFrame to prune.

  Returns:
    The same ``data_df`` object, without its ``'_calc'`` columns.
  """
  # Collect the labels first and drop them in one call: dropping columns
  # while iterating over ``data_df.items()`` mutates the frame that is being
  # iterated. ``str(label)`` keeps non-string labels from raising.
  calc_columns = [label for label in data_df.columns if '_calc' in str(label)]
  if calc_columns:
    data_df.drop(columns=calc_columns, inplace=True)
  return data_df


In [None]:
train_df_copy = remove_calc(train_df_copy)

train_df_copy.columns.values

In [None]:
train_df_copy.columns.value_counts()

In [None]:
fig, ax = plt.subplots(figsize=(20,10))         
(train_df.isna().sum()*100/len(train_df)).round(2).plot(kind='bar', color='salmon');

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

### From the above plot of missing values in each feature, features `ps_car_03_cat` and `ps_car_05_cat` both have above ~50% missing values - it's safe to drop them as they'll add minimal value to the model.

In [None]:

train_df_copy.drop(['ps_car_03_cat', 'ps_car_05_cat'], axis=1, inplace=True)

In [None]:
train_df_copy.info()

In [None]:
train_df_copy.columns.values

### Impute all missing data (designated as -1 in the Kaggle data description)

In [None]:

# Module-level registries filled by preprocess_data(); reset them to empty
# lists before preprocessing a different frame.
categorical_column = []
categorical_missing_data = []
not_categorical = []


def _fill_with_mode(column):
  """Return ``column`` with NaNs replaced by its most frequent value."""
  modes = column.mode()
  # ``mode()`` is empty when every value is missing: nothing to fill with.
  if modes.empty:
    return column
  return column.fillna(modes.iloc[0])


def preprocess_data(data_df):
  """Impute missing values in ``data_df`` in place and return the same frame.

  Columns are handled according to their label:

  * label contains ``'_cat'``: NaNs are replaced by the column mode and the
    column is cast to ``category``; the label is appended to the
    module-level ``categorical_column`` list.
  * label contains ``'_bin'``: NaNs are replaced by the column mode.
  * anything else: NaNs are replaced by the column median; the label is
    appended to the module-level ``not_categorical`` list.

  ``id`` and ``target`` columns, when present, are never imputed and end up
  as the leading columns of the result (``id`` before ``target``).

  Args:
    data_df: DataFrame to impute. It is modified in place.

  Returns:
    The same ``data_df`` object.

  Side effects:
    Appends to ``categorical_column`` / ``not_categorical`` and prints
    ``categorical_column``.
  """
  # Set id/target aside so they are neither imputed nor registered as
  # features; they are re-inserted at the front afterwards.
  held_out = {}
  for name in ('target', 'id'):
    if name in data_df.columns:
      held_out[name] = data_df.pop(name)

  # Assign the filled column back instead of calling
  # ``data_df[label].fillna(..., inplace=True)``: the in-place call works on
  # an intermediate Series and is not guaranteed to update the frame.
  for label in list(data_df.columns):
    content = data_df[label]
    if '_cat' in label:
      categorical_column.append(label)
      data_df[label] = _fill_with_mode(content).astype('category')
    elif '_bin' in label:
      data_df[label] = _fill_with_mode(content)
    else:
      data_df[label] = content.fillna(content.median())
      not_categorical.append(label)

  print(categorical_column)

  # Inserting target first and id second leaves id at position 0.
  for name in ('target', 'id'):
    if name in held_out:
      data_df.insert(loc=0, column=name, value=held_out[name])

  # TODO: outlier removal (rows outside mean +/- 4 standard deviations per
  # feature) was prototyped here but is currently disabled.
  return data_df
        

In [None]:
preprocessed_train_data = preprocess_data(train_df_copy)



In [None]:
preprocessed_train_data.isna().sum()

In [None]:
preprocessed_train_data.info()

In [None]:
# shuffled_df = preprocessed_train_data

In [None]:
# # shuffled_df.drop(['id'], axis=1, inplace=True)
# shuffled_df[categorical_column].head(10)

In [None]:
len(preprocessed_train_data)

In [None]:
# # Extract Features and target

# X = shuffled_df.drop(['target', 'id'], axis=1)
# y=  shuffled_df['target']

#train_df_copy['ps_ind_02_cat'].value_counts()
len(categorical_column), len(categorical_missing_data), len(not_categorical)

### Convert categorical data to numerical using one-hot encoding

In [None]:
# # from sklearn.preprocessing import OneHotEncoder
# # from sklearn.compose import ColumnTransformer
# # categorical_features = categorical_column
# # one_hot = OneHotEncoder(sparse=False)
# # transformer = ColumnTransformer([('one_hot', one_hot, categorical_features)], remainder='passthrough')

# # transformed_x = transformer.fit_transform(X)
# shuffled_df_encoded = pd.get_dummies(shuffled_df[categorical_column])


In [None]:
# shuffled_df_encoded.head()

In [None]:
# shuffled_df_encoded.isna().sum()

### After the one-hot encoding, we drop the original unencoded categorical columns, then one of the new encoded feature columns to reduce multicollinearity.

In [None]:
# shuffled_cat_dropped = shuffled_df.drop(categorical_column, axis=1)
# shuffled_df_encoded.drop(['ps_ind_02_cat_3.0'], axis=1, inplace=True)
# shuffled_cat_dropped.head()

### Concatenate the encoded categorical features with the other features less the unencoded categorical features

In [None]:
# shuffled_upd = pd.concat([shuffled_cat_dropped, shuffled_df_encoded], axis=1)

In [None]:
# shuffled_upd.head()

In [None]:
preprocessed_train_data.head()

In [None]:
def Encode_Scale(data_df, categorical_features,
                 drop_dummies=('ps_ind_02_cat_3.0',)):
  """One-hot encode ``categorical_features`` of ``data_df``.

  Despite its name the function does not scale anything; scaling is done
  separately by the caller.

  The listed categorical columns are replaced by their ``pd.get_dummies``
  expansion (appended after the remaining feature columns). ``id`` and
  ``target`` columns, when present, are passed through unchanged and placed
  first (``id`` before ``target``). The input frame is not modified.

  Args:
    data_df: DataFrame holding the features (and optionally id/target).
    categorical_features: labels of the columns to one-hot encode.
    drop_dummies: labels of encoded columns to remove afterwards, to reduce
      multicollinearity. Labels that do not exist are ignored.

  Returns:
    A new DataFrame with the encoded features.
  """
  categorical_features = list(categorical_features)
  passthrough = [name for name in ('target', 'id') if name in data_df.columns]

  # Work on a derived frame so the caller keeps its id/target columns.
  features = data_df.drop(columns=passthrough)

  encoded = pd.get_dummies(features[categorical_features])
  # errors='ignore': the reference dummy may be absent from a given frame.
  encoded = encoded.drop(columns=list(drop_dummies), errors='ignore')

  # Remaining (non-categorical) features first, encoded columns after.
  data_upd = pd.concat(
      [features.drop(columns=categorical_features), encoded], axis=1)

  # Inserting target first and id second leaves id at position 0.
  for name in passthrough:
    data_upd.insert(loc=0, column=name, value=data_df[name])

  return data_upd


In [None]:
preprocessed_train_data.head()

In [None]:
shuffled_upd = Encode_Scale(preprocessed_train_data, categorical_column)

In [None]:
shuffled_upd.head()

In [None]:
# Extract Features and target

X = shuffled_upd.drop(['target', 'id'], axis=1)
y=  shuffled_upd['target']

### Let's further explore the individual features and their importance - using the resource - https://github.com/WillKoehrsen/feature-selector/blob/master/Feature%20Selector%20Usage.ipynb

In [None]:
from feature_selector import FeatureSelector
fs = FeatureSelector(X, y)

In [None]:
fs.identify_all(selection_params = {'missing_threshold': 0.6, 'correlation_threshold': 0.98, 
                                    'task': 'classification', 'eval_metric': 'auc', 
                                     'cumulative_importance': 0.99})

In [None]:
# justcheckit = fs.one_hot_features

In [None]:
shuffled_df_removed_all_once = fs.remove(methods = 'all', keep_one_hot = True)

In [None]:
shuffled_df_removed_all_once.shape

In [None]:
fs.plot_feature_importances(plot_n = 15, threshold=0.99)

In [None]:
# Keep the names of the features whose cumulative importance is below the cut-off.
# NOTE(review): 0.990402 is a hand-picked threshold, presumably chosen to line
# up with the 0.99 cumulative_importance passed to identify_all — confirm.
preferred_features = np.array(fs.feature_importances[fs.feature_importances['cumulative_importance']<0.990402]['feature'])

In [None]:
len(preferred_features)

In [None]:
preferred_data = fs.data[preferred_features]
preferred_data.head()

In [None]:
# Using get_dummies to encode categorical features
# cat_df = pd.get_dummies(shuffled_df, columns=[categorical_column])
# cat_df.head()

In [None]:
### Lets scale the features to get them with same range of magnitude

from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(preferred_data)

In [None]:
X = pd.DataFrame(X)
X.head()


### We are going to use learning models from imblearn due to the imbalanced nature of the dataset.

In [None]:
### Models used
# Models from scikit-learn

from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import RepeatedStratifiedKFold
# from xgboost import XGBClassifier

# Model Evaluations
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import recall_score, f1_score, precision_score, accuracy_score, auc



In [None]:
# np.random.seed(42)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

### Given the imbalanced nature of the dataset, let's deploy two balancing techniques: the Synthetic Minority Oversampling Technique (SMOTE) with a LogisticRegression estimator, and the BalancedBaggingClassifier estimator (with a LightGBM base estimator) with its underlying undersampling technique.


### From the above it's clear the data needs balancing - Let's do it!

In [None]:
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb
from imblearn.ensemble import BalancedRandomForestClassifier


# Build a model with an embedded undersampling technique: a
# BalancedBaggingClassifier wrapping a LightGBM classifier.
# param = {'num_leaves': 31, 'objective': 'binary'}
# param['metric'] = 'auc'
# NOTE(review): newer imbalanced-learn releases appear to name this argument
# `estimator` instead of `base_estimator` — confirm against the installed version.
mpipeline = make_pipeline_imb(BalancedBaggingClassifier(base_estimator=lgb.LGBMClassifier(n_jobs=-1),
                                                   sampling_strategy='auto',
                                                   replacement=False,
                                                   random_state=0))
model = mpipeline.fit(X_train, y_train)
# This score is neither stored nor the last expression of the cell, so it is
# not displayed here.
model.score(X_val, y_val)
# Class probabilities on the validation split, used for the Gini metric later.
bbc_pred = model.predict_proba(X_val)


In [None]:
# Build a model on SMOTE-balanced training data.
# Oversample ONLY the training split: resampling before the split lets
# synthetic points interpolated from validation rows leak into training and
# leaves a validation set that no longer reflects the real class ratio.
# Re-using the seeded split above also makes this cell reproducible.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

X_train2, y_train2 = X_res, y_res
X_val2, y_val2 = X_val, y_val  # untouched hold-out from the earlier split
smote_model = LogisticRegression(n_jobs=-1)      #XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100)
smote_model.fit(X_train2, y_train2)
smote_score = smote_model.score(X_val2, y_val2)


In [None]:
smote_score

In [None]:
smote_pred = smote_model.predict_proba(X_val2)

In [None]:
smote_model.score(X_train2, y_train2)

In [None]:
smote_predict = smote_model.predict(X_val2)

In [None]:
model.score(X_train, y_train)

In [None]:
model.score(X_val, y_val)

### Hyperparameter tuning of the model

In [None]:

# cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# scores = cross_val_score(smote_model, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
# score_accuracy = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

In [None]:
# score_accuracy.mean()

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

### Evaluation metric for the model: the Normalized Gini Coefficient, as requested for the competition. All credit for the code goes to `https://www.kaggle.com/tezdhar/faster-gini-calculation`

In [None]:
# Calculating the normalized gini coefficient.
def ginic(actual, pred):
    """Return the unnormalized Gini statistic of ``pred`` against ``actual``.

    ``actual`` is reordered by ascending ``pred`` and the cumulative sum of
    that ordering is compared with the uniform baseline ``(n + 1) / 2``.
    The raw value is meant to be used through :func:`gini_normalizedc`,
    which divides it by the value of a perfect ordering.

    Args:
        actual: true labels (list, array or Series).
        pred: scores used to rank the samples (list, array or Series).

    Returns:
        The unnormalized statistic as a float. It is NaN when ``actual``
        sums to zero (division by zero).
    """
    # Accept lists and Series alike; positional indexing is used below.
    actual = np.asarray(actual)
    pred = np.asarray(pred)
    n = len(actual)
    a_s = actual[np.argsort(pred)]
    a_c = a_s.cumsum()
    giniSum = a_c.sum() / a_s.sum() - (n + 1) / 2.0
    return giniSum / n


def gini_normalizedc(a, p):
    """Return the normalized Gini coefficient of predictions ``p`` for ``a``.

    Args:
        a: true labels (list, array or Series).
        p: predicted scores. Either one score per sample, or a 2-D
            ``predict_proba``-style array, in which case column 1 (the
            class-1 probability) is used.

    Returns:
        ``ginic(a, p) / ginic(a, a)``: 1.0 for a perfect ranking.
    """
    # Convert first so that plain lists (which have no ``ndim``) work too.
    p = np.asarray(p)
    if p.ndim == 2:  # required for the sklearn wrapper
        p = p[:, 1]  # keep only the class-1 probabilities
    return ginic(a, p) / ginic(a, a)


In [None]:
smote_pred[:,1]

In [None]:
gini_normalizedc(y_val, bbc_pred[:,1])

In [None]:
gini_normalizedc(y_val2, smote_pred[:, 1])

### Load test Dataset

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
test_df = pd.read_csv('../input/porto-seguro-safe-driver-prediction/test.csv', low_memory=False, na_values='-1')
test_df.head()

In [None]:
test_df.shape

In [None]:
test_df.isna().sum()

In [None]:
test_data_no_id = test_df.drop(['id'], axis=1)

In [None]:
### Check missing values in the test data
fig, ax = plt.subplots(figsize=(20,10))         
(test_df.isna().sum()*100/len(test_df)).round(2).plot(kind='bar', color='salmon');

In [None]:
### Remove features having more than 50% of their data missing
test_df.drop(['ps_car_03_cat', 'ps_car_05_cat'], axis=1, inplace=True)

## Preprocess Test Dataset

In [None]:
# test_df.drop(['id'], axis=1, inplace=True)
categorical_column=[]
categorical_missing_data = []
not_categorical = []
preprocessed_test_df = preprocess_data(test_df)

In [None]:
len(categorical_column), len(categorical_missing_data), len(not_categorical)

In [None]:
preprocessed_test_df.isna().sum()

In [None]:
preprocessed_test_df.head()

In [None]:
fig, ax = plt.subplots(figsize=(20,10))         
# Imbalanced DataFrame Correlation
corr = preprocessed_test_df.corr()
sns.heatmap(corr, cmap='RdYlBu', annot_kws={'size':30}, ax=ax)
ax.set_title("Feature Correlation Matrix", fontsize=14)
plt.show()

In [None]:
preprocessed_test_df   = remove_calc(preprocessed_test_df)

In [None]:
preprocessed_test_df.columns.values

### All features with the `_calc_` designation in their label have been removed, since they have little or no correlation with the other features and are therefore of little consequence to the model

In [None]:
preprocessed_test_df.head()

In [None]:
preprocessed_test_df_upd = Encode_Scale(preprocessed_test_df, categorical_column)

In [None]:
# sum_feature_df= pd.DataFrame(transformed_testData_x[:10])
# sum_feature_df
preprocessed_test_df_upd.drop(['id'], axis=1, inplace=True)

In [None]:
preprocessed_test_data = preprocessed_test_df_upd[preferred_features]

In [None]:
preprocessed_test_data.shape

In [None]:
preprocessed_test_data.head()

In [None]:
preprocessed_test_data.head()

In [None]:
# Scale the test features with the statistics of the TRAINING features
# (``preferred_data``), i.e. the same transformation the models were fitted
# on. Fitting a fresh scaler on the test set would shift and rescale the
# inputs differently from what the models saw during training.
X_test = StandardScaler().fit(preferred_data).transform(preprocessed_test_data)

In [None]:
X_test = pd.DataFrame(X_test)
X_test.head()

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
X_test.shape

In [None]:
test_pred = smote_model.predict_proba(X_test)

In [None]:
test_pred[:,1][:20]

In [None]:
test_pred2 = model.predict_proba(X_test)

In [None]:
test_pred2[:,1][:20]

In [None]:
# preprocessed_test_df.head()

In [None]:
# Collect the class-1 probabilities predicted by `model` into the submission frame.
# NOTE(review): the frame holds only a `target` column with a 0..n-1 index;
# presumably a Kaggle submission also needs the test `id` column — confirm
# before writing the CSV.
PIC_Submission = pd.DataFrame(test_pred2[:,1], columns=['target'], index=np.arange(0,len(preprocessed_test_df)))

In [None]:
PIC_Submission.head()

In [None]:
len(PIC_Submission)

In [None]:
#PIC_Submission.to_csv('submit_pred_2.csv')