#### Import libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#### Import Dataset

In [None]:
# Load the Kaggle train / test CSV files into two separate frames.
df_train = pd.read_csv(
    "/kaggle/input/term-deposit-prediction-data-set/train.csv"
)
df_test = pd.read_csv(
    "/kaggle/input/term-deposit-prediction-data-set/test.csv"
)

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# (rows, columns) of the training frame.
df_train.shape


In [None]:
# (rows, columns) of the test frame.
df_test.shape

In [None]:
# Column dtypes and non-null counts of the training frame.
df_train.info()

#### Correlations between continuous variables in train data

In [None]:
# Pairwise correlations between the numeric columns of the training data.
# The frame also holds string columns (job, marital, ...); selecting numeric
# dtypes first keeps this cell working on pandas >= 2.0, where DataFrame.corr()
# raises on non-numeric columns instead of silently dropping them.
correlations = df_train.select_dtypes(include="number").corr()

plt.figure(figsize=(12, 8))
sns.heatmap(data=correlations, square=True, annot=True, cmap="viridis")

# Keep the row labels horizontal and the column labels vertical so they stay readable.
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()

#### Summarize train data

In [None]:
# Summary statistics of the training frame.
df_train.describe()

In [None]:
# Number of rows per value of the target column.
df_train['subscribed'].value_counts()

In [None]:
# Missing values per column of the TRAINING frame. This cell belongs to the
# train-data summary; the test frame gets its own missing-value check further
# down, so checking df_test here only duplicated that cell and left the
# training data unchecked.
df_train.isnull().sum()

#### Summarize test data

In [None]:
# Summary statistics of the test frame.
df_test.describe()

#### Checking missing values in test data

In [None]:
# Number of missing values in each column of the test frame.
df_test.isnull().sum()

#### Encoding categorical variables in training data

In [None]:
# Look at the raw training rows again before encoding the categorical columns.
df_train.head()

In [None]:
# One-hot encode every categorical predictor of the training frame.
category_variables = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'poutcome',
]
df_train_encoded = pd.get_dummies(df_train.loc[:, category_variables])
df_train_encoded.shape

In [None]:
# Preview the dummy columns produced for the training data.
df_train_encoded.head()

In [None]:
# Dtypes and non-null counts of the training dummy columns.
df_train_encoded.info()

#### Replace categorical variables with dummy variables in training data

In [None]:
# Swap the raw categorical columns for their one-hot encoded counterparts.
df = pd.concat(
    [
        df_train.drop(
            columns=['job', 'marital', 'education', 'default', 'housing',
                     'loan', 'contact', 'month', 'poutcome']
        ),
        df_train_encoded,
    ],
    axis=1,
)

In [None]:
# Preview the training frame after the categorical columns were replaced by dummies.
df.head()

In [None]:
# Dtype of every column in the encoded training frame.
df.dtypes

#### Encoding categorical variables in testing data

In [None]:
# Look at the raw test rows again before encoding the categorical columns.
df_test.head()

In [None]:
# One-hot encode the categorical predictors of the test frame.
category_variables = ['job', 'marital', 'education', 'default', 'housing',
                      'loan', 'contact', 'month', 'poutcome']

# get_dummies() only creates columns for the levels present in the frame it is
# given, so encoding the test file on its own can yield a different column set
# than the training data. Align to the training dummy columns: levels missing
# from the test file become all-zero columns, and levels that occur only in the
# test file (which the model never saw) are dropped.
df_test_encoded = (
    pd.get_dummies(df_test[category_variables])
    .reindex(columns=df_train_encoded.columns, fill_value=0)
)
df_test_encoded.shape

In [None]:
# Dtypes and non-null counts of the test dummy columns.
df_test_encoded.info()

#### Replace categorical variables with dummy variables in test data

In [None]:
# Swap the raw categorical columns of the test frame for their dummy columns.
df1_test = pd.concat(
    [
        df_test.drop(
            columns=['job', 'marital', 'education', 'default', 'housing',
                     'loan', 'contact', 'month', 'poutcome']
        ),
        df_test_encoded,
    ],
    axis=1,
)

In [None]:
# Preview the encoded test frame.
df1_test.head()

#### Perform label encoding on the target column in the training data

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the target labels into integer class codes; the fitted encoder is kept
# in `label_encoder`.
label_encoder = LabelEncoder()
label_encoder.fit(df['subscribed'])
df['subscribed'] = label_encoder.transform(df['subscribed'])

#### Checking dependent variable classes

In [None]:
# Number of rows per encoded target class.
df['subscribed'].value_counts()

In [None]:
# Bar chart of the number of rows in each target class.
sns.countplot(data=df, x='subscribed', palette='husl')

In [None]:
# Share of each target class, as a percentage of all rows labelled 0 or 1.
count_no_subscribed = int((df['subscribed'] == 0).sum())
count_subscribed = int((df['subscribed'] == 1).sum())
total_labelled = count_no_subscribed + count_subscribed

pct_no_subscribed = (count_no_subscribed / total_labelled) * 100
pct_subscribed = (count_subscribed / total_labelled) * 100

print("percentage of no subscription is ", pct_no_subscribed)
print("percentage of subscription ", pct_subscribed)

Here our dependent variable classes are imbalanced, and the ratio of no-subscription to subscription instances is 88:12. So we have to balance the classes.

In [None]:
# Target vector and feature matrix; the row identifier is not a predictor.
y = df['subscribed']
X = df.drop(columns=['ID', 'subscribed'])

In [None]:
pip install --upgrade scikit-learn

#### Perform SMOTE to balance the dependent variable classes

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for validation; the seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
# Remember the feature names so they can be re-attached after resampling.
columns = X_train.columns

In [None]:
# Display the training-split target values.
y_train

In [None]:
# Fit SMOTE on the training split only and oversample it.
# NOTE(review): the name `os` shadows the standard-library module of the same
# name; it is kept unchanged here in case other cells refer to it.
os = SMOTE(random_state=0)
# fit_sample() was a deprecated alias that has been removed from current
# imbalanced-learn releases; fit_resample() is the supported method.
os_data_X, os_data_y = os.fit_resample(X_train, y_train)

In [None]:
# Wrap the resampled features / target in DataFrames with explicit column names.
os_data_X = pd.DataFrame(os_data_X, columns=columns)
os_data_y = pd.DataFrame(os_data_y, columns=['subscribed'])

In [None]:
# Report the size and class balance of the oversampled training data.
n_oversampled = len(os_data_X)
n_no_subscription = len(os_data_y[os_data_y['subscribed'] == 0])
n_subscription = len(os_data_y[os_data_y['subscribed'] == 1])

print("length of oversampled data is ", n_oversampled)
print("Number of no subscription in oversampled data", n_no_subscription)
print("Number of subscription", n_subscription)
print("Proportion of no subscription data in oversampled data is ", n_no_subscription / n_oversampled)
print("Proportion of subscription data in oversampled data is ", n_subscription / n_oversampled)

#### Feature selection using RFE

In [None]:
# Array of all column names in the encoded training frame.
df.columns.values

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Column-name bookkeeping. These lists are not used by the RFE fit below, so
# they get their own names instead of overwriting the feature matrix `X` and
# target `y` with plain lists of strings.
data_final_vars = df.columns.values.tolist()
target_vars = ['subscribed']
feature_vars = [name for name in data_final_vars if name not in target_vars]

logreg = LogisticRegression()

# n_features_to_select has to be passed by keyword: scikit-learn >= 1.0 accepts
# only the estimator positionally, so RFE(logreg, 20) raises a TypeError there.
rfe = RFE(logreg, n_features_to_select=20)
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())

# support_ is the boolean mask and ranking_ the rank of each column, both in the
# column order of os_data_X (the matrix passed to fit above).
print(rfe.support_)
print(rfe.ranking_)

RFE has helped us select the following 20 features: 'previous', 'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired', 'job_self-employed', 'job_services', 'job_student', 'job_technician', 'job_unemployed', 'job_unknown', 'marital_divorced', 'marital_married', 'marital_single', 'education_primary', 'education_secondary', 'default_no', 'default_yes'. (The target column 'subscribed' is not a candidate feature: it was removed from the feature matrix before RFE was fitted.)

In [None]:
# Feature subset used for modelling, taken from the oversampled training data.
cols = [
    'previous',
    'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
    'job_management', 'job_retired', 'job_self-employed', 'job_services',
    'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
    'marital_divorced', 'marital_married', 'marital_single',
    'education_primary', 'education_secondary',
    'default_no', 'default_yes',
]
X = os_data_X.loc[:, cols]
y = os_data_y['subscribed']


In [None]:
# (rows, columns) of the selected-feature matrix.
X.shape

In [None]:
# Preview the selected-feature matrix.
X.head()

In [None]:
# Shape of the target vector that goes with X.
y.shape

#### Implementing the model

In [None]:
import statsmodels.api as sm

# Fit a statsmodels logit of y on the selected features and print its summary.
# NOTE(review): X is passed as-is; no constant column is added in this notebook.
logit_model = sm.Logit(endog=y, exog=X)
result = logit_model.fit()
print(result.summary2())

##### The p-values for all of the variables are smaller than 0.05. Hence we can consider these variables for our model.

In [None]:
# Final columns for model building: the names of the features currently in X.
X.columns.values

In [None]:
# Select the model's feature columns in the test data.
# Take the column list from the training matrix X instead of a second
# hand-typed copy, so the test features always match the training features in
# both name and order.
col_list = X.columns.tolist()

# reindex() rather than df1_test[col_list]: a dummy column whose category never
# occurs in the test file is created as all zeros instead of raising KeyError.
df1_test_final = df1_test.reindex(columns=col_list, fill_value=0)
df1_test_final.head()

In [None]:
# (rows, columns) of the test feature matrix.
df1_test_final.shape

#### Split our data for training, validation and testing

In [None]:
# Train on the full SMOTE-balanced training data (X, y) and validate on the
# real hold-out rows from the earlier train/validation split, which SMOTE never
# saw. Re-splitting the oversampled data here would put synthetic,
# artificially balanced rows into the validation set (many of them
# interpolated from rows that remain in training), which inflates every
# validation metric and throws away the genuine hold-out set.
X_train, y_train = X, y

# Restrict the hold-out features to the columns the model is trained on.
# y_val is left untouched: it still holds the hold-out labels of that split.
X_val = X_val.loc[:, cols]

X_test = df1_test_final

#### Shapes of the datasets

In [None]:
# Shapes of the training, validation and test inputs, shown as one tuple.
X_train.shape,X_val.shape,y_train.shape,y_val.shape,X_test.shape

####  Building a logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression
# Fresh logistic-regression estimator with default settings; this rebinds the
# name `logreg`, which the feature-selection cell also assigns.
logreg=LogisticRegression()

In [None]:
# Fit the classifier on the training split.
logreg.fit(X_train,y_train)

#### Validate the Logistic Regression model

In [None]:
# Predicted class labels for the validation rows.
y_log_pred=logreg.predict(X_val)

In [None]:
from sklearn import metrics

# Fraction of validation rows whose predicted label matches the true label.
val_accuracy = metrics.accuracy_score(y_val, y_log_pred)
print('Accuracy of logistic regression classifier on validation set:')
print(val_accuracy)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of true vs. predicted validation labels.
val_confusion = confusion_matrix(y_val, y_log_pred)
print(val_confusion)

In [None]:
# Per-class text report comparing the validation labels with the predictions.
print(classification_report(y_val,y_log_pred))

#### ROC Curve

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Predicted probability of the positive class, computed once and used for both
# the AUC and the curve. Scoring the AUC on hard predict() labels collapses the
# ranking to a single threshold, so the value shown in the legend would not be
# the area under the curve that is actually plotted.
val_scores = logreg.predict_proba(X_val)[:, 1]
logit_roc_auc = roc_auc_score(y_val, val_scores)
fpr, tpr, thresholds = roc_curve(y_val, val_scores)

plt.figure(figsize=(12, 8))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: a classifier that ranks at random.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save before show() so the written file contains the rendered figure.
plt.savefig('Log_ROC')
plt.show()

#### Predicting the test dataset

In [None]:
# Predicted class labels for the test rows.
y_pred_final=logreg.predict(X_test)

In [None]:
# Put the predictions into a one-column frame so they can be joined to the test rows.
y_pred_final = pd.DataFrame(
    data=y_pred_final,
    columns=['Subscribed_Predicted'],
)

In [None]:
# Preview the prediction column.
y_pred_final.head()

In [None]:
# Attach the prediction column to the original (un-encoded) test rows.
pred_data = pd.concat(objs=[df_test, y_pred_final], axis='columns')

In [None]:
# Map the integer class codes back to readable labels.
code_to_label = {0: 'no', 1: 'yes'}
pred_data['Subscribed_Predicted'] = pred_data['Subscribed_Predicted'].replace(code_to_label)

In [None]:
# Preview the test rows together with their predicted label.
pred_data.head()

In [None]:
# Number of test rows per predicted label.
pred_data['Subscribed_Predicted'].value_counts()

In [None]:
# Bar chart of the number of test rows per predicted label.
sns.countplot(data=pred_data, x='Subscribed_Predicted', palette='husl')

#### Export Result

In [None]:
# Write the test rows with their predicted label to disk.
# index=False: the frame's index is only the default row counter, and writing
# it would add a meaningless unnamed first column to the exported file.
pred_data.to_csv("Term deposit prediction.csv", index=False)