In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score, classification_report
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of
# every file found, one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#  Taking an equal number of instances from the positive and negative classes for the training data

In [None]:
# Load the full training set from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/santander-customer-transaction-prediction/train.csv")

# Balance the classes by undersampling: keep every positive row (target == 1)
# and an equal number of negative rows (target == 0). The negative count is
# derived from the data instead of being hard-coded, so the classes stay
# balanced even if the input file changes.
train_1 = train.loc[train['target'] == 1]
# NOTE(review): head() keeps the first negatives in file order rather than a
# random sample -- confirm that the file order carries no signal.
train_2 = train.loc[train['target'] == 0].head(len(train_1))
train = pd.concat([train_1, train_2])

# Features: every column except the row identifier and the label.
X = train.drop(['ID_code','target'],axis = 1)
# Label, kept as a single-column DataFrame.
y = train[['target']]

#  Splitting the data into two parts: two-thirds as training data and one-third as validation data.

In [None]:
# Hold out 33% of the balanced data for validation. The split is stratified
# on the label and uses a fixed seed so it is repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=16,
    stratify=y,
)
X_train_raw, X_valid_raw, y_train_raw, y_valid_raw = split_parts

# Checking the ratio of classes in train and validation data sets

In [None]:
# Print the positive-to-negative row ratio, first for the training labels and
# then for the validation labels.
for label_frame in (y_train_raw, y_valid_raw):
    positive_count = int((label_frame['target'] == 1).sum())
    negative_count = int((label_frame['target'] == 0).sum())
    print(positive_count / negative_count)

# Scaling the features

In [None]:
def scale_minmax(df, scaler=None):
    """Min-max scale every column of ``df`` and return the result as a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to scale.
    scaler : object, optional
        An already fitted scaler exposing ``transform``. When given, it is
        applied as-is (no refitting), which lets validation or test data be
        scaled with the statistics learned from the training data. When
        omitted, a new ``MinMaxScaler`` is fitted on ``df`` itself, which is
        the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The scaled values, carrying the same column labels and index as ``df``.
    """
    if scaler is None:
        # No scaler supplied: fit a fresh one on this data.
        scaled_values = MinMaxScaler().fit_transform(df.values)
    else:
        # Reuse the supplied scaler so this data shares its mapping.
        scaled_values = scaler.transform(df.values)
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

# Applying scaling to the train and validation data sets.

In [None]:
# Fit the scaler on the training features only and reuse it for the validation
# features, so both sets go through the same min/max mapping and no validation
# statistics influence the preprocessing.
feature_scaler = MinMaxScaler().fit(X_train_raw.values)

X_train = pd.DataFrame(feature_scaler.transform(X_train_raw.values),
                       columns=X_train_raw.columns, index=X_train_raw.index)
X_valid = pd.DataFrame(feature_scaler.transform(X_valid_raw.values),
                       columns=X_valid_raw.columns, index=X_valid_raw.index)

# The target is a 0/1 class label, not a feature, so it is used unscaled.
# copy() keeps the raw label frames independent of these working copies.
y_train = y_train_raw.copy()
y_valid = y_valid_raw.copy()

# GridSearch to find the best hyperparameters

In [None]:
# from sklearn.model_selection import GridSearchCV

# grid_search_rfc = GridSearchCV(
#              estimator = RandomForestClassifier(),
#              param_grid={
#                  'n_jobs' : [-1],
#                  'n_estimators' : [500,1000,2000],
#                  'max_features' : ['sqrt', 'log2'],
#                  'random_state' : [33],
#                  'max_depth':[2,4,8,16,32,None]
#              }
#             )

# grid_search_rfc.fit(X_train,y_train.values.ravel())
# print(grid_search_rfc.best_estimator_)

# Model initialization using the parameters from the above cell

In [None]:
# Random forest configuration. Bootstrap sampling is enabled together with the
# out-of-bag score, and the seed is fixed so the fit is repeatable.
rf_params = {
    'n_estimators': 2000,
    'max_features': 'log2',
    'bootstrap': True,
    'oob_score': True,
    'random_state': 33,
    'n_jobs': -1,
}
rf_clf = RandomForestClassifier(**rf_params)

# Train on the scaled training features; ravel() flattens the single-column
# label frame into a 1-D array.
rf_clf.fit(X_train, y_train.values.ravel())

# Out-of-bag score

In [None]:
# Score of the fitted forest computed on its out-of-bag samples.
oob_accuracy = rf_clf.oob_score_
print(oob_accuracy)

# Calculating accuracy using different measures on train data.

In [None]:
# 3-fold cross-validated predictions on the training data: every row is
# predicted by a model that did not see it during fitting.
train_labels = y_train.values.ravel()
y_train_predictions = cross_val_predict(rf_clf, X_train, train_labels, cv=3)

# Report precision, recall and F1 for those predictions.
train_metrics = (
    ('precision_score :', precision_score),
    ('recall_score :', recall_score),
    ('f1_score :', f1_score),
)
for metric_label, metric_fn in train_metrics:
    print(metric_label + str(metric_fn(y_train, y_train_predictions)))

# Mean of the per-fold scores from a second 3-fold cross-validation run.
y_train_scores_acc = cross_val_score(rf_clf, X_train, train_labels, cv=3)
print(y_train_scores_acc.mean())


In [None]:
# Confusion matrix of the cross-validated training predictions
# (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_train, y_pred=y_train_predictions)

In [None]:
# Text report of the per-class metrics for the cross-validated training
# predictions.
train_report = classification_report(y_train, y_train_predictions)
print(train_report)

# Calculating accuracy using different measures on validation data.

In [None]:
# Evaluate the forest that was fitted on the training data against the
# held-out validation set. Cross-validating on the validation data would
# instead refit fresh copies of the model on validation folds and say nothing
# about how rf_clf itself performs on unseen rows.
y_valid_true = y_valid.values.ravel()
y_valid_predictions = rf_clf.predict(X_valid)

print('precision_score :'+str(precision_score(y_valid_true, y_valid_predictions)))
print('recall_score :'+str(recall_score(y_valid_true, y_valid_predictions)))
print('f1_score :'+str(f1_score(y_valid_true, y_valid_predictions)))

# Hold-out accuracy: fraction of validation rows predicted correctly.
y_valid_scores_acc = np.mean(y_valid_predictions == y_valid_true)
print(y_valid_scores_acc)

# Preparing test data

In [None]:
# Load the competition test set and keep the row identifiers for the
# submission file.
test = pd.read_csv("/kaggle/input/santander-customer-transaction-prediction/test.csv")
test_id = test[['ID_code']]

X_test_raw = test.drop(['ID_code'],axis = 1)

# Scale the test features with the min/max learned from the training features,
# so they go through the same mapping as the data the model was trained on.
# Fitting a separate scaler on the test set would map the same raw value to a
# different scaled value than the model saw during training.
# NOTE(review): columns are matched by position -- assumes the test file lists
# the feature columns in the same order as the training file; confirm.
test_scaler = MinMaxScaler().fit(X_train_raw.values)
X_test = pd.DataFrame(test_scaler.transform(X_test_raw.values),
                      columns=X_test_raw.columns, index=X_test_raw.index)

# Prediction on test data and writing the predictions to csv

In [None]:
# Predict a class label for every test row with the fitted forest.
# NOTE(review): hard labels are written; if the competition metric is
# ranking-based, predict_proba may be the better output -- confirm.
y_test = rf_clf.predict(X_test)

# Assemble the submission: one row per test ID with its predicted label.
op = pd.DataFrame({'ID_code': test_id['ID_code'], 'target': y_test})

# Write the submission to the working directory without the index column.
op.to_csv("submission_random_tree_tuned_params_v6.csv",index=False)
print('File created')
print('File created')