In [1]:
# kaggle competitions download -c santander-customer-transaction-prediction
import os
import pandas as pd
import numpy as np
import csv

%matplotlib inline
import matplotlib.pyplot as plt

DATA_PATH = os.path.join("datasets", "CustTransPrediction")

def load_data(filename, data_path=DATA_PATH):
    csv_path = os.path.join(data_path, filename)
    return pd.read_csv(csv_path)

# Create submission file with assigned predicted results from models
def create_file_for_submission(filename, classifier_predictions):
    
    classifier_predictions = np.reshape(classifier_predictions, (classifier_predictions.shape[0], 1))
    print("The reshape of Prediction numpy array : ", classifier_predictions.shape)

    classifier_predicted_results = np.concatenate((iD_code, classifier_predictions), axis=1)
    print("The concatenation of iD_code and Prediction numpy arrays  : ", classifier_predicted_results.shape)

    print("For iD_codes : ", classifier_predicted_results[0:10,0])
    print("The respective predicted results : ", classifier_predicted_results[0:10,1])
    
    # Create and overwrite existing file
    with open('datasets/CustTransPrediction/' + filename, 'w') as writeFile:
        filewriter = csv.writer(writeFile, delimiter=',',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        filewriter.writerow(['ID_code', 'target'])

        iteration_range = classifier_predicted_results.shape[0]
        for i in range(iteration_range):
            filewriter.writerow([str(classifier_predicted_results[i,0]), str(classifier_predicted_results[i,1])])

    writeFile.close()

In [2]:
# Loading the respective Train and Test data
train_data = load_data("train.csv")
test_data = load_data("test.csv")

In [3]:
train_data.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Columns: 202 entries, ID_code to var_199
dtypes: float64(200), int64(1), object(1)
memory usage: 308.2+ MB


In [5]:
test_data.head()

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,test_0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,...,-2.1556,11.8495,-1.43,2.4508,13.7112,2.4669,4.3654,10.72,15.4722,-8.7197
1,test_1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,...,10.6165,8.8349,0.9403,10.1282,15.5765,0.4773,-1.4852,9.8714,19.1293,-20.976
2,test_2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,...,-0.7484,10.9935,1.9803,2.18,12.9813,2.1281,-7.1086,7.0618,19.8956,-23.1794
3,test_3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,...,9.5702,9.0766,1.658,3.5813,15.1874,3.1656,3.9567,9.2295,13.0168,-4.2108
4,test_4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,...,4.2259,9.1723,1.2835,3.3778,19.5542,-0.286,-5.1612,7.2882,13.926,-9.1846


In [6]:
X = train_data.drop(["ID_code", "target"], axis=1)
y = train_data["target"].copy()

X.shape, y.shape

((200000, 200), (200000,))

In [7]:
X_test = test_data.drop(['ID_code'], axis=1)
X_test.shape

(200000, 200)

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

poly_scaler = Pipeline([
    #("poly_features", PolynomialFeatures(degree=2, include_bias=False)), # degree=10
    ("standard_scaler", StandardScaler()) ])

#X_train_poly_scaled = poly_scaler.fit_transform(X_train)
#X_val_poly_scaled = poly_scaler.transform(X_val)
#X_test_poly_scaled = poly_scaler.transform(X_test)

# If degree=3, there is a Memory Error

- For Polynomial Features of degree 10, scaling will take more than 1 hour on a t2.instance

In [9]:
X_train = poly_scaler.fit_transform(X)
y_train = y

y_train.value_counts()

0    179902
1     20098
Name: target, dtype: int64

# Stochastic Gradient Descent

In [10]:
from sklearn.model_selection import train_test_split

# X_train.shape is (200000, 200) and y_train.shape is (200000,)
# ravel returns a contiguous flattened array giving (x,) instead of (x, 1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train.ravel(), test_size=0.2, random_state=42)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

((160000, 200), (40000, 200), (160000,), (40000,))

In [11]:
from sklearn.linear_model import SGDClassifier
sgd_classifier = SGDClassifier(max_iter=5, tol=-np.infty, random_state=42, class_weight='balanced')
sgd_classifier.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight='balanced',
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=42, shuffle=True, tol=-inf, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [12]:
from sklearn.metrics import accuracy_score
y_prediction = sgd_classifier.predict(X_val)
accuracy_score(y_val, y_prediction)

0.7083

In [13]:
# Scikit-Learn Cross-Validation
from sklearn.model_selection import cross_val_score

sgd_scores = cross_val_score(sgd_classifier, X_train, y_train, cv=3, scoring="accuracy")
sgd_scores.mean()

0.6994374795151931

In [14]:
# Scale X_test and ready for predictions
X_test_scaled = poly_scaler.fit_transform(X_test)

sgd_predictions = sgd_classifier.predict(X_test_scaled)
sgd_predictions.shape

(200000,)

- Create submission file

In [15]:
# Convert from DataFrames to Numpy Array
iD_code = test_data['ID_code'].values # get from DataFrame to Numpy Array
iD_code = np.reshape(iD_code, (iD_code.shape[0], 1))

In [16]:
create_file_for_submission("sgd_classifier.csv", sgd_predictions)

The reshape of Prediction numpy array :  (200000, 1)
The concatenation of iD_code and Prediction numpy arrays  :  (200000, 2)
For iD_codes :  ['test_0' 'test_1' 'test_2' 'test_3' 'test_4' 'test_5' 'test_6' 'test_7'
 'test_8' 'test_9']
The respective predicted results :  [1 1 0 0 0 0 0 0 0 0]


In [17]:
from sklearn.svm import LinearSVC

svm_classifier = LinearSVC(class_weight='balanced')
svm_classifier.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [18]:
# Using Cross Validation to check the performance of Support Vector Machine
svm_scores = cross_val_score(svm_classifier, X_train, y_train, cv=3)
svm_scores.mean()



0.8448374789356233

In [19]:
svm_predictions = svm_classifier.predict(X_test_scaled)
svm_predictions.shape

(200000,)

In [20]:
create_file_for_submission("svm_classifier.csv", svm_predictions)

The reshape of Prediction numpy array :  (200000, 1)
The concatenation of iD_code and Prediction numpy arrays  :  (200000, 2)
For iD_codes :  ['test_0' 'test_1' 'test_2' 'test_3' 'test_4' 'test_5' 'test_6' 'test_7'
 'test_8' 'test_9']
The respective predicted results :  [1 1 0 1 0 0 0 0 0 0]


In [21]:
from sklearn.ensemble import VotingClassifier

voting_classifier = VotingClassifier(
    estimators=[('sgd_classifier', sgd_classifier),
                ('svm_classifier', svm_classifier)], # svm_prob_classifier
    voting='hard')

voting_classifier.fit(X_train, y_train)

voting_classifier.get_params() # gives parameters of the VotingClassifier



{'estimators': [('sgd_classifier',
   SGDClassifier(alpha=0.0001, average=False, class_weight='balanced',
                 early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
                 l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
                 n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
                 random_state=42, shuffle=True, tol=-inf, validation_fraction=0.1,
                 verbose=0, warm_start=False)),
  ('svm_classifier',
   LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
             intercept_scaling=1, loss='squared_hinge', max_iter=1000,
             multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
             verbose=0))],
 'flatten_transform': True,
 'n_jobs': None,
 'voting': 'hard',
 'weights': None,
 'sgd_classifier': SGDClassifier(alpha=0.0001, average=False, class_weight='balanced',
               early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,


In [22]:
voting_classifier_scores = cross_val_score(voting_classifier, X_train, y_train, cv=3)
voting_classifier_scores.mean()



0.8673812014423247

In [23]:
voting_classifier_predictions = voting_classifier.predict(X_test_scaled)

voting_classifier_predictions.shape

(200000,)

In [24]:
create_file_for_submission("voting_classifier.csv", voting_classifier_predictions)

The reshape of Prediction numpy array :  (200000, 1)
The concatenation of iD_code and Prediction numpy arrays  :  (200000, 2)
For iD_codes :  ['test_0' 'test_1' 'test_2' 'test_3' 'test_4' 'test_5' 'test_6' 'test_7'
 'test_8' 'test_9']
The respective predicted results :  [1 1 0 0 0 0 0 0 0 0]


# Submissions
- kaggle competitions submit -c santander-customer-transaction-prediction -f submission.csv -m "Message"
- Submitted Public score of 0.516 at position 5,822
- Submitted Public score of 0.593 at position 7,912
- Submitted Public score of 0.749 at position 7,082 with balanced weight SVM Classifier on scaled test data having mean cv score of 84.645%.
- Submitted Public score of 0.627 with Hard Voting Classifier having mean cv score of 89.886%.

# Using Random Forest Classifier

- The cross validation will take more than 2.5 hours on t2 instance
- It takes about 1.3 hours on GCP n1-standard-4 (4 vCPUs, 15 GB memory)

In [25]:
# Ensemble Learning : Using Random Forest Classifier for prediction
from sklearn.ensemble import RandomForestClassifier

# Using Cross Validation to check the performance of Random Forest Classifier
from sklearn.model_selection import cross_val_score

X_train = poly_scaler.fit_transform(X)
y_train = y
forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
forest_scores = cross_val_score(forest_classifier, X_train, y_train, n_jobs= 4, cv=10)
forest_scores.mean()

# Mean score of 89.954%

0.8995449996495875

- The model fit and prediction also takes more than 0.5 hours on t2 instance
- It also takes more than 0.5 hours on GCP n1-standard-4 (4 vCPUs, 15 GB memory)

In [27]:
# Making predictions on the test set with Random Forest Classifier
forest_classifier.fit(X_train, y_train)
y_randomforest_predictions = forest_classifier.predict(X_test_scaled)

y_randomforest_predictions.shape

(200000,)

In [28]:
y_randomforest_predictions[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [29]:
# Convert from DataFrames to Numpy Array
iD_code = test_data['ID_code'].values # get from DataFrame to Numpy Array
iD_code = np.reshape(iD_code, (iD_code.shape[0], 1))
print("The reshape of ID_code numpy array : ", iD_code.shape)

y_randomforest_predictions = np.reshape(y_randomforest_predictions, (y_randomforest_predictions.shape[0], 1))
print("The reshape of Random Forest Prediction numpy array : ", y_randomforest_predictions.shape)

randomforest_predicted_results = np.concatenate((iD_code, y_randomforest_predictions), axis=1)
print("The concatenation of iD_code and Prediction numpy arrays  : ", randomforest_predicted_results.shape)

print("For iD_codes : ", randomforest_predicted_results[0:10,0])
print("The respective predicted results : ", randomforest_predicted_results[0:10,1])

The reshape of ID_code numpy array :  (200000, 1)
The reshape of Random Forest Prediction numpy array :  (200000, 1)
The concatenation of iD_code and Prediction numpy arrays  :  (200000, 2)
For iD_codes :  ['test_0' 'test_1' 'test_2' 'test_3' 'test_4' 'test_5' 'test_6' 'test_7'
 'test_8' 'test_9']
The respective predicted results :  [0 0 0 0 0 0 0 0 0 0]


- OSError: [Errno 12] Cannot allocate memory on t2 instance
- It takes more than 1.3 hours on GCP n1-standard-4 (4 vCPUs, 15 GB memory)

In [None]:
# Search the best combination of hyperparameter values
from sklearn.model_selection import GridSearchCV

kfold=10
rf_param_grid = {"max_depth": [None],
                 "max_features": ['auto'], # [1, 3, 10],
                 "min_samples_split": [2, 3, 10],
                 "min_samples_leaf": [1, 3, 10],
                 "bootstrap": [False],
                 "n_estimators" :[100, 200, 500],
                 "criterion": ["gini"]}

# Tuning the param for GridSearch and performance has been increased to approx. 83.40% 
grid_search = GridSearchCV(forest_classifier, rf_param_grid, cv=kfold, scoring="accuracy", n_jobs= 4, verbose = 1) # without scoring parameter
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 27 candidates, totalling 270 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
grid_search.best_params_

In [None]:
# Using GridSearchCV to obtain final model with best estimator
best_random_forest_classifier_model = grid_search.best_estimator_
best_random_forest_classifier_model

In [None]:
best_random_forest_classifier_scores = cross_val_score(best_random_forest_classifier_model, X_train, y_train,
                                                       n_jobs= 4, cv=10)
best_random_forest_classifier_scores.mean()

In [None]:
best_randomforest_predictions = best_random_forest_classifier_model.predict(X_test_scaled)
best_randomforest_predictions.shape

create_file_for_submission("best_random_forest_classifier.csv", best_randomforest_predictions)

# Stochastic Gradient Descent

In [None]:
# Stochastic Gradient Descent optimizing squared error cost with default learning rate and no regularization
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.base import clone

sgd_regressor = SGDRegressor(max_iter=1, tol=-np.infty, penalty=None, eta0=0.0005,
                             warm_start=True, learning_rate="constant", random_state=42)

num_epochs = 10
minimum_val_error = float("inf")
best_epoch = None
best_sgd_regressor_model = None

for epoch in range(num_epochs):
    # With warm_start=True and fit() is called, it will continue where it left off instead of restarting from scratch
    sgd_regressor.fit(X_train, y_train)
    y_val_predict = sgd_regressor.predict(X_val)

    val_error = mean_squared_error(y_val, y_val_predict)
    
    print("Current epoch :", epoch, "with mse :", val_error)

    if val_error < minimum_val_error:
        print("Epoch : ", epoch, "with min mse :", val_error)
        minimum_val_error = val_error
        best_epoch = epoch
        best_sgd_regressor_model = clone(sgd_regressor)

best_epoch, best_sgd_regressor_model

In [None]:
y_val_predict.shape

In [None]:
best_sgd_regressor_model.fit(X_train, y_train)
sgd_predictions = best_sgd_regressor_model.predict(X_test_scaled)

# Making predictions on the test set
#y_sgd_predictions = best_sgd_regressor_model.predict(X_val_poly_scaled)

sgd_predictions.shape

In [None]:
sgd_score = best_sgd_regressor_model.score

sgd_score

In [None]:
create_file_for_submission("sgd_regressor.csv", sgd_predictions)

# SGDRegressor' object has no attribute 'decision_function'

In [None]:
# Scikit-Learn Cross-Validation Prediction
from sklearn.model_selection import cross_val_predict

# Compare classifiers by measuring the "area under the curve" (AUC) = 1 means perfect classifier
from sklearn.metrics import roc_auc_score

y_scores = cross_val_predict(best_sgd_regressor_model, X_train, y_train, cv=3, method="decision_function")
roc_auc_score(y_train, y_scores)