# Build models without any feature engineering

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [38]:
# Load the training split; the trailing expression renders its first five rows.
train_data = pd.read_csv(
    filepath_or_buffer=r"G:\IIITD\SML\Project\elo_merchant_category\train.csv"
)
train_data.head(n=5)

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06,C_ID_92a2005557,5,2,1,-0.820283
1,2017-01,C_ID_3d0044924f,4,1,0,0.392913
2,2016-08,C_ID_d639edf6cd,2,2,0,0.688056
3,2017-09,C_ID_186d6a6901,4,3,0,0.142495
4,2017-11,C_ID_cdbd2c0db2,1,3,0,-0.159749


In [39]:
# Load the test split (no target column); the trailing expression renders its first five rows.
test_data = pd.read_csv(
    filepath_or_buffer=r"G:\IIITD\SML\Project\elo_merchant_category\test.csv"
)
test_data.head(n=5)

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3
0,2017-04,C_ID_0ab67a22ab,3,3,1
1,2017-01,C_ID_130fd0cbdd,2,3,0
2,2017-08,C_ID_b709037bc5,5,1,1
3,2017-12,C_ID_d27d835a9f,2,1,0
4,2015-12,C_ID_2b5e3df5c2,5,1,1


In [40]:
# Remove the activation-month column from both frames; the models below select
# their feature columns by position, so it must not be left among them.
# Rebinding with errors='ignore' keeps this cell re-runnable: an in-place drop
# raises KeyError the second time because the column is already gone.
train_data = train_data.drop(columns=['first_active_month'], errors='ignore')
test_data = test_data.drop(columns=['first_active_month'], errors='ignore')

In [41]:
# Relocate 'target' to column position 1 (right after card_id), so that the model
# functions can read the target at position 1 and the features from position 2 on.
target = train_data['target']
del train_data['target']
train_data.insert(loc=1, column='target', value=target)
train_data.columns

Index(['card_id', 'target', 'feature_1', 'feature_2', 'feature_3'], dtype='object')

## Linear Regression Model

In [43]:
from sklearn import linear_model
def linear_regression(train_data, test_data):
    """Fit a linear-regression baseline and predict the test set.

    Columns are taken by position: column 1 of ``train_data`` is the target and
    columns 2 onward are the features; columns 1 onward of ``test_data`` are
    used as the matching features.

    Prints the train predictions, the RMSE on the training data, and the test
    predictions.

    Returns:
        Array of predictions for the rows of ``test_data``.
    """
    features_fit = train_data.iloc[:, 2:].values
    target_fit = train_data.iloc[:, 1].values
    features_score = test_data.iloc[:, 1:].values

    # fit() returns the estimator itself, so construction and fitting can be chained.
    ols = linear_model.LinearRegression().fit(features_fit, target_fit)

    # In-sample predictions, used only to report the training error.
    fitted = ols.predict(features_fit)
    print("Train predictions: ", fitted)

    # RMSE = sqrt(sum of squared residuals / number of rows).
    squared_residuals = (target_fit - fitted) ** 2
    train_rmse = np.sqrt(np.sum(squared_residuals) / len(target_fit))
    print("Root Mean squared error on the train data: ", train_rmse)

    test_predictions = ols.predict(features_score)
    print("Test predictions: ",test_predictions)

    return test_predictions

# Run the linear-regression baseline and keep its predictions for the test rows.
linear_predictions = linear_regression(train_data, test_data)
    

Train predictions:  [-0.49973716 -0.4181723  -0.35435852 ... -0.50566923 -0.39217492
 -0.34842645]
Root Mean squared error on the train data:  3.849965430724074
Test predictions:  [-0.43592338 -0.39810698 -0.4559887  ... -0.4559887  -0.31061005
 -0.4559887 ]


## Random Forest Model

In [47]:
from sklearn.ensemble import RandomForestRegressor
def random_forest(train_data, test_data, random_state=42):
    """Fit a random-forest regressor baseline and predict the test set.

    Columns are taken by position: column 1 of ``train_data`` is the target and
    columns 2 onward are the features; columns 1 onward of ``test_data`` are
    used as the matching features.

    Prints the train predictions, the RMSE on the training data, and the test
    predictions.

    Args:
        train_data: DataFrame laid out as (id, target, features...).
        test_data: DataFrame laid out as (id, features...).
        random_state: Seed forwarded to ``RandomForestRegressor`` so repeated
            runs produce the same forest. Pass ``None`` for an unseeded forest.

    Returns:
        Array of predictions for the rows of ``test_data``.
    """
    X_train = train_data.iloc[:, 2:].values
    Y_train = train_data.iloc[:, 1].values
    X_test = test_data.iloc[:, 1:].values

    # Seeding the estimator makes the reported error and predictions reproducible.
    rf = RandomForestRegressor(random_state=random_state)
    rf.fit(X_train, Y_train)

    # In-sample predictions, used only to report the training error.
    y_pred_train = rf.predict(X_train)
    print("Train predictions: ", y_pred_train)

    # RMSE = sqrt(sum of squared residuals / number of rows).
    squared_error = (Y_train - y_pred_train) ** 2
    rmse = np.sqrt(np.sum(squared_error) / len(Y_train))
    print("Root Mean squared error on the train data: ", rmse)

    y_pred = rf.predict(X_test)
    print("Test predictions: ", y_pred)

    return y_pred

# Run the random-forest baseline and keep its predictions for the test rows.
forest_predictions = random_forest(train_data, test_data)
    

Train predictions:  [-0.40953328 -0.40705137 -0.30249897 ... -0.37727926 -0.29911306
 -0.26717217]
Root Mean squared error on the train data:  3.848821699727652
Test predictions:  [-0.56700389 -0.31446568 -0.56261871 ... -0.56261871 -0.40648854
 -0.56261871]


## LightGBM Model

In [50]:
import lightgbm as lgb
def Light_gbm_model(train_data, test_data):
    """Fit a LightGBM regression baseline and predict the test set.

    Columns are taken by position: column 1 of ``train_data`` is the target and
    columns 2 onward are the features; columns 1 onward of ``test_data`` are
    used as the matching features.

    Prints the train predictions, the RMSE on the training data, and the test
    predictions.

    Returns:
        Array of predictions for the rows of ``test_data``.
    """
    X_train = train_data.iloc[:, 2:].values
    Y_train = train_data.iloc[:, 1].values
    X_test = test_data.iloc[:, 1:].values

    d_train = lgb.Dataset(X_train, label=Y_train)
    params = {
        'learning_rate': 0.001,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'mse',
        'num_leaves': 300,
        'min_data': 200,
    }
    # Third positional argument is the number of boosting rounds.
    clf = lgb.train(params, d_train, 500)

    # In-sample predictions, used only to report the training error.
    y_pred_train = clf.predict(X_train)
    print("Train predictions: ", y_pred_train)

    # RMSE = sqrt(sum of squared residuals / number of rows). The label says
    # "Root" because the square root is what gets printed, matching the other
    # baseline models in this notebook.
    squared_error = (Y_train - y_pred_train) ** 2
    rmse = np.sqrt(np.sum(squared_error) / len(Y_train))
    print("Root Mean squared error on the train data: ", rmse)

    y_pred = clf.predict(X_test)
    print("Test predictions: ", y_pred)

    return y_pred

# Run the LightGBM baseline and keep its predictions for the test rows.
light_gbm_predictions = Light_gbm_model(train_data, test_data) 

Train predictions:  [-0.399158   -0.40156964 -0.3578372  ... -0.38891576 -0.35539181
 -0.34378152]
Mean squared error on the train data:  3.849434804798477
Test predictions:  [-0.46266541 -0.36084082 -0.45952297 ... -0.45952297 -0.39981425
 -0.45952297]


## XGBoost Model

In [54]:
def xgb_model(train_data, test_data):
    """Fit an XGBoost regressor on an 80/20 split and predict the test set.

    Columns are taken by position: column 1 of ``train_data`` is the target and
    columns 2 onward are the features; columns 1 onward of ``test_data`` are
    used as the matching features.

    The training rows are split 80/20 with a fixed seed. The model is fitted on
    the 80% part, and the RMSE is reported on the held-out 20% part. The
    ``X_test`` / ``Y_test`` labels in the printed output refer to that held-out
    part, not to ``test_data``.

    Returns:
        Array of predictions for the rows of ``test_data``.
    """
    features = train_data.iloc[:, 2:].values
    target = train_data.iloc[:, 1].values
    test = test_data.iloc[:, 1:].values

    # Hold out 20% of the labelled rows to estimate the error.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=123
    )

    print("X_train", X_train.shape)
    print("X_test", X_test.shape)
    print("Y_train", y_train.shape)
    print("Y_test", y_test.shape)

    regressor = xgb.XGBRegressor(objective='reg:squarederror')
    model = regressor.fit(X_train, y_train)

    # y_test is already a NumPy array (it was sliced from .values), so it can be
    # printed and subtracted directly.
    test_pred = model.predict(X_test)
    print(test_pred)
    print(y_test)

    # RMSE = sqrt(sum of squared residuals / number of rows). The label says
    # "Root" because the square root is what gets printed.
    squared_error = (y_test - test_pred) ** 2
    rmse = np.sqrt(np.sum(squared_error) / len(y_test))
    print("Root Mean Squared Error of testing data", rmse)

    xgb_test_pred = model.predict(test)
    print("Predictions on the testing dataset:", xgb_test_pred)

    return xgb_test_pred

# Run the XGBoost baseline and keep its predictions for the test rows.
xgb_predictions = xgb_model(train_data, test_data)

X_train (161533, 3)
X_test (40384, 3)
Y_train (161533,)
Y_test (40384,)
[-0.31856078 -0.5720693  -0.37571162 ... -0.5708957  -0.31856078
 -0.31856078]
[ 4.39027725 -0.68812574 -0.57395811 ...  0.07554159 -0.12086987
  0.96800893]
Mean Squared Error of testing data 3.7186839615039604
Predictions on the testing dataset: [-0.5708957  -0.3231389  -0.5720693  ... -0.5720693  -0.42832512
 -0.5720693 ]


In [55]:
def save_predictions(predictions, test_data,
                     output_path=r"G:\IIITD\SML\Project\elo_merchant_category\Baseline\submit_xgb.csv"):
    """Write a two-column (card_id, target) submission CSV.

    Args:
        predictions: One predicted target per row of ``test_data``.
        test_data: DataFrame containing a ``card_id`` column.
        output_path: Destination CSV file. Defaults to the original hard-coded
            XGBoost submission path, so existing two-argument calls still write
            to the same file.
    """
    submission = pd.DataFrame({'card_id': test_data['card_id']})
    submission['target'] = predictions
    # index=False keeps the row index out of the file; without it an extra
    # unnamed leading column is written alongside card_id and target.
    submission.to_csv(output_path, index=False)
    
# Write the XGBoost predictions for the test cards to the submission CSV.
save_predictions(xgb_predictions, test_data)