In [3]:
# Import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,StandardScaler,LabelBinarizer
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor ,RandomForestRegressor ,GradientBoostingRegressor, BaggingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import warnings
warnings.filterwarnings('ignore')
from scipy.stats.mstats import winsorize
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns',None)

# Running Models on Train data

In [4]:
# Loading cleaned dataframe

cdf_cleaned = pd.read_csv('sat_train_cleaned.csv')

In [5]:
# Label encoding account type and gender back from Train dataset

df_cat = pd.read_csv('Train.csv')
df_cat = df_cat[['account_type','gender']]
lb = LabelBinarizer()
for col in df_cat.columns:
    df_cat[col] = lb.fit_transform(df_cat[col])
df_cat.head()

Unnamed: 0,account_type,gender
0,0,1
1,0,1
2,0,1
3,0,0
4,0,1


In [6]:
# Splitting Features and Target in X and y and updating encoded account_type and gender in features X

X = np.log(cdf_cleaned.iloc[:,:-1])
X = pd.concat([df_cat,X],axis=1)
y = np.log(cdf_cleaned.cc_cons)

In [7]:
# Splitting into train and test data

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [8]:
# Using Evaluation matrix Root Mean Squared Logarithmic Error.

def rmsle(actual_column, predicted_column):
    sum=0.0
    for x,y in zip(actual_column,predicted_column):
        if x<0 or y<0: #check for negative values. 
            continue
        p = np.log(y+1)
        r = np.log(x+1)
        sum = sum + (p - r)**2
    return (sum/len(predicted_column))**0.5

In [9]:
# First Model : Linear Regression

linreg = LinearRegression()
linreg.fit(X_train,y_train)
y_pred = linreg.predict(X_test)
print('Y-Pred :\n',y_pred)
linreg_rmsle = rmsle(y_test,y_pred)
print("\nRMSLE for Linear Regression :\n",linreg_rmsle)

Y-Pred :
 [9.70346684 9.67443933 9.58473694 ... 9.7414335  9.65756543 9.62143431]

RMSLE for Linear Regression :
 0.13184094956920464


In [10]:
# Decision Tree Regressor

dt_reg = DecisionTreeRegressor(max_depth=2)
dt_reg.fit(X_train,y_train)
y_pred_dtr = dt_reg.predict(X_test)
print('Y-Pred :\n',y_pred_dtr)
dt_reg_rmsle = rmsle(y_test,y_pred_dtr)
print("\nRMSLE for Decision Tree Regressor :\n",dt_reg_rmsle)


Y-Pred :
 [9.64319642 9.64319642 9.64319642 ... 9.70612265 9.64319642 9.64319642]

RMSLE for Decision Tree Regressor :
 0.13191585370014866


In [11]:
# Bagging Regressor

bagging_reg = BaggingRegressor(base_estimator=DecisionTreeRegressor(),n_estimators=100,max_samples=100,random_state=0)
bagging_reg.fit(X_train,y_train)
y_pred_bagging = bagging_reg.predict(X_test)
print("Y-Pred :\n",y_pred_bagging)
bagging_rmsle = rmsle(y_test,y_pred_bagging)
print("\nRMSLE for Bagging Regressor :\n",bagging_rmsle)

Y-Pred :
 [9.81265178 9.48089827 9.50416926 ... 9.77839701 9.54476912 9.42031828]

RMSLE for Bagging Regressor :
 0.13315442764848145


In [12]:
# Pasting Regressor

pasting_reg = BaggingRegressor(base_estimator=DecisionTreeRegressor(),n_estimators=100,max_samples=100,bootstrap=False, \
                               random_state=0)
pasting_reg.fit(X_train,y_train)
y_pred_pasting = pasting_reg.predict(X_test)
print("Y-Pred :\n",y_pred_pasting)
pasting_rmsle = rmsle(y_test,y_pred_pasting)
print("\nRMSLE for Pasting Regressor :\n",pasting_rmsle)

Y-Pred :
 [9.78945231 9.41492543 9.52396423 ... 9.92472798 9.58080125 9.46956119]

RMSLE for Pasting Regressor :
 0.13351153292063792


In [13]:
# Random Forest Regression

rf_reg = RandomForestRegressor(n_estimators=100,n_jobs=100,min_samples_leaf=100,random_state=0)
rf_reg.fit(X_train,y_train)
y_pred_rf = rf_reg.predict(X_test)
print('Y-Pred :\n',y_pred_rf)
rf_rmsle = rmsle(y_test,y_pred_rf)
print('\nRMSLE for Random Forest Regressor :\n',rf_rmsle)

Y-Pred :
 [9.71207666 9.67119051 9.53900911 ... 9.73874822 9.60698943 9.6168988 ]

RMSLE for Random Forest Regressor :
 0.1320447291227798


# Running models on Test data

In [24]:
test_df = pd.read_csv("Test.csv")
test_df.shape

(5000, 43)

In [27]:
test_ids = test_df.copy()
ids = test_ids['ID']
test_acc_gender = test_df[['account_type','gender']]

In [28]:
def clean_test(dataset):
    # Merging features
    dataset['total_cc_cons'] = (dataset.cc_cons_apr + dataset.cc_cons_may + dataset.cc_cons_jun)
    dataset['total_dc_cons'] = (dataset.dc_cons_apr + dataset.dc_cons_may + dataset.dc_cons_jun)
    dataset['total_cc_count'] = (dataset.cc_count_apr + dataset.cc_count_may + dataset.cc_count_jun)
    dataset['total_dc_count'] = (dataset.dc_count_apr + dataset.dc_count_may + dataset.dc_count_jun)
    dataset['total_debit_amount'] = (dataset.debit_amount_apr + dataset.debit_amount_may + dataset.debit_amount_jun)
    dataset['total_credit_amount'] = (dataset.credit_amount_apr + dataset.credit_amount_may + dataset.credit_amount_jun)
    dataset['total_debit_count'] = (dataset.debit_count_apr + dataset.debit_count_may + dataset.debit_count_jun)
    dataset['total_credit_count'] = (dataset.credit_count_apr + dataset.credit_count_may + dataset.credit_count_jun)
    dataset['total_investments'] = (dataset.investment_1 + dataset.investment_2 + dataset.investment_3 + dataset.investment_4)
    return dataset

test_df = clean_test(test_df)

In [29]:
def drop_cols(dataset):
    
    # droping columns
    dataset.drop(['cc_cons_apr','cc_cons_may','cc_cons_jun','dc_cons_apr','dc_cons_may','dc_cons_jun',\
                     'cc_count_apr','cc_count_may','cc_count_jun','dc_count_apr','dc_count_may','dc_count_jun',\
                    'investment_1','investment_2','investment_3','investment_4',\
                    'personal_loan_active','personal_loan_closed','vehicle_loan_active','vehicle_loan_closed',\
                    'debit_amount_apr','debit_amount_may','debit_amount_jun','credit_amount_apr','credit_amount_may','credit_amount_jun',\
                    'loan_enq','debit_count_apr','credit_count_apr','debit_count_may','credit_count_may',\
                    'debit_count_jun','credit_count_jun','ID','account_type','gender'],axis=1,inplace=True)
    return dataset

test_df = drop_cols(test_df)

In [30]:
def treat_outliers(dataframe):
    cols = list(dataframe)
    for col in cols:
        if col in dataframe.select_dtypes(include=np.number).columns:
            dataframe[col] = winsorize(dataframe[col], limits=[0.05, 0.1],inclusive=(True, True))
    
    return dataframe    

test_df = treat_outliers(test_df)

In [35]:
# Taking log of features of test data

test_df = np.log(test_df)
test_df.head()


Unnamed: 0,age,region_code,card_lim,max_credit_amount_apr,max_credit_amount_may,max_credit_amount_jun,emi_active,total_cc_cons,total_dc_cons,total_cc_count,total_dc_count,total_debit_amount,total_credit_amount,total_debit_count,total_credit_count,total_investments
0,3.610918,6.35437,11.736069,10.105816,9.185023,10.16377,7.065178,9.802705,9.766837,2.995732,4.691348,11.310655,11.673061,3.871201,4.143135,12.828078
1,3.496508,5.976351,10.985293,9.220291,9.739026,9.410338,9.707926,10.645964,8.529106,2.995732,4.744932,11.029059,12.03663,4.795791,3.988984,13.318141
2,3.970292,5.780744,10.819758,9.612467,10.502269,9.489108,7.871799,9.187916,9.313639,3.73767,3.970292,12.342922,11.613477,4.442651,3.970292,13.572578
3,3.496508,5.913503,13.161584,11.459651,11.490935,11.188815,5.831266,10.624487,10.560412,5.141664,2.833213,12.357511,12.209821,3.931826,3.218876,13.103155
4,4.127134,6.224558,11.931636,9.581559,10.237671,10.653582,7.941722,9.190227,9.337129,4.077537,3.433987,12.29373,11.321268,4.795791,4.343805,14.182173


In [34]:
#  Label encoding account_type and gender

lb = LabelBinarizer() 

for col in test_acc_gender.columns:
    test_acc_gender[col] = lb.fit_transform(test_acc_gender[col])

test_acc_gender.head()

Unnamed: 0,account_type,gender
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1


In [37]:
# Combining test features with acount type and gender

test_X = pd.concat([test_acc_gender,test_df],axis=1)

In [39]:
test_X.head()

Unnamed: 0,account_type,gender,age,region_code,card_lim,max_credit_amount_apr,max_credit_amount_may,max_credit_amount_jun,emi_active,total_cc_cons,total_dc_cons,total_cc_count,total_dc_count,total_debit_amount,total_credit_amount,total_debit_count,total_credit_count,total_investments
0,0,1,3.610918,6.35437,11.736069,10.105816,9.185023,10.16377,7.065178,9.802705,9.766837,2.995732,4.691348,11.310655,11.673061,3.871201,4.143135,12.828078
1,0,1,3.496508,5.976351,10.985293,9.220291,9.739026,9.410338,9.707926,10.645964,8.529106,2.995732,4.744932,11.029059,12.03663,4.795791,3.988984,13.318141
2,0,1,3.970292,5.780744,10.819758,9.612467,10.502269,9.489108,7.871799,9.187916,9.313639,3.73767,3.970292,12.342922,11.613477,4.442651,3.970292,13.572578
3,0,1,3.496508,5.913503,13.161584,11.459651,11.490935,11.188815,5.831266,10.624487,10.560412,5.141664,2.833213,12.357511,12.209821,3.931826,3.218876,13.103155
4,0,1,4.127134,6.224558,11.931636,9.581559,10.237671,10.653582,7.941722,9.190227,9.337129,4.077537,3.433987,12.29373,11.321268,4.795791,4.343805,14.182173


In [42]:
# Linear Regression on Test data to get Predictions.

linreg1 = LinearRegression()
linreg1.fit(X_train,y_train)
y_predictions_lr = linreg1.predict(test_X)
print(y_predictions_lr)
y_predictions = np.exp(y_predictions_lr)
print("\nY-Predictiions using Linear Regression :\n",y_predictions)

[9.58010435 9.66523593 9.56403619 ... 9.55839873 9.63750673 9.65266118]

Y-Predictiions using Linear Regression :
 [14473.92952032 15760.08777596 14243.21863742 ... 14163.14907637
 15329.07649481 15563.14931286]


In [50]:
final_pred = pd.concat([ids,pd.Series(y_predictions)],axis=1,keys=['ID','cc_cons'])
final_pred = final_pred.set_index('ID')

In [53]:
final_pred.to_csv('C:/Data Science/Jupyter Notebook/Hackathon Updated/Final Prediction/Test_Prediction.csv')