In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


In [None]:
# Load the semicolon-delimited bank marketing dataset, then summarise its
# numeric columns.
df = pd.read_csv(
    '/kaggle/input/bank-marketing-campaigns-dataset/bank-additional-full.csv',
    sep=';',
)
df.describe()

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Count missing values per column of the raw frame.
df.isna().sum()

# Feature description
Bank client data:

1. - age (numeric)
2. - job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')
3. - marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)
4. - education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')
5. - default: has credit in default? (categorical: 'no','yes','unknown')
6. - housing: has housing loan? (categorical: 'no','yes','unknown')
7. - loan: has personal loan? (categorical: 'no','yes','unknown')

In [None]:
# One indicator column per distinct value of `job`.
dummy1 = pd.get_dummies(data=df['job'])

In [None]:
# Drop the 'unknown' indicator from the job dummies and preview the result.
dfjob = dummy1.drop(columns='unknown')
dfjob.head()

In [None]:
# Indicator columns for `marital`, with the 'unknown' indicator dropped.
dummy2 = pd.get_dummies(data=df['marital'])
maritaldf = dummy2.drop(columns='unknown')
maritaldf.head()

In [None]:
# Indicator columns for `education`, with the 'unknown' indicator dropped.
dummy3 = pd.get_dummies(data=df['education'])
educationdf = dummy3.drop(columns='unknown')
educationdf.head()

In [None]:
# Count missing values per education indicator column.
dummy3.isna().sum()

In [None]:
# Indicator columns for `default` (has credit in default?).
# The prefix keeps these columns distinguishable after concatenation: the
# `housing` and `loan` features share the same 'no'/'unknown'/'yes' categories,
# so unprefixed dummies would yield several columns with identical names.
dummy4 = pd.get_dummies(df['default'], prefix='default')
# Drop the 'yes' indicator (same column the original dropped, now prefixed).
defaultdf = dummy4.drop(columns='default_yes')
defaultdf.head()

In [None]:
# Indicator columns for `housing` (has housing loan?).
# The prefix keeps these columns distinguishable after concatenation: the
# `default` and `loan` features share the same 'no'/'unknown'/'yes' categories,
# so unprefixed dummies would yield several columns with identical names.
dummy5 = pd.get_dummies(df['housing'], prefix='housing')
# Drop the 'yes' indicator (same column the original dropped, now prefixed).
housingdf = dummy5.drop(columns='housing_yes')
housingdf.head()

In [None]:
# Indicator columns for `loan` (has personal loan?).
# The prefix keeps these columns distinguishable after concatenation: the
# `default` and `housing` features share the same 'no'/'unknown'/'yes'
# categories, so unprefixed dummies would yield several columns with identical
# names.
dummy6 = pd.get_dummies(df['loan'], prefix='loan')
# Drop the 'yes' indicator (same column the original dropped, now prefixed).
loandf = dummy6.drop(columns='loan_yes')
loandf.head()

# Related to the last contact of the current campaign:
11. - duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y="no"). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.

In [None]:
# Inspect the raw `duration` column.
df.loc[:, 'duration']

# other attributes:
12. - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)

13. - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)

14. - previous: number of contacts performed before this campaign and for this client (numeric)

15. - poutcome: outcome of the previous marketing campaign (categorical: "failure","nonexistent","success")

In [None]:
# Indicator columns for `poutcome`, with the 'success' indicator dropped.
dummy8 = pd.get_dummies(data=df['poutcome'])
poutcomedf = dummy8.drop(columns='success')
poutcomedf.head()

# social and economic context attributes
16. - emp.var.rate: employment variation rate - quarterly indicator (numeric)

17. - cons.price.idx: consumer price index - monthly indicator (numeric)

18. - cons.conf.idx: consumer confidence index - monthly indicator (numeric)

19. - euribor3m: euribor 3 month rate - daily indicator (numeric)

20. - nr.employed: number of employees - quarterly indicator (numeric)

# Output
21. y - has the client subscribed to a term deposit? (binary: "yes","no")

In [None]:
# Encode the target as 1 ("yes") / 0 ("no").
# Guarded so that re-running this cell is a no-op: mapping an already-numeric
# column through the same dict would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = df['y'].map({'yes': 1, 'no': 0})

In [None]:
# Preview the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Append every indicator frame to the raw data, side by side (column-wise).
encoded_frames = [dfjob, educationdf, defaultdf, loandf, poutcomedf, housingdf, maritaldf]
merged = pd.concat([df, *encoded_frames], axis=1, join='outer')

In [None]:
# Preview the combined frame.
merged.head(n=5)

In [None]:
# Remove the original string-valued columns so only numeric and indicator
# columns remain.
categorical_columns = [
    'marital', 'job', 'education', 'default', 'loan',
    'housing', 'contact', 'month', 'day_of_week', 'poutcome',
]
data = merged.drop(columns=categorical_columns)

In [None]:
# Preview the modelling frame.
data.head(n=5)

In [None]:
# Count missing values per column of the modelling frame.
data.isna().sum()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the feature matrix from the target column `y`.
x = data.drop(columns='y')
y = data['y']

In [None]:
# Hold out 20% of the rows for testing; the seed is fixed so the split is
# repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=5,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic regression with default settings on the training split.
model = LogisticRegression()
model.fit(X=xtrain, y=ytrain)

In [None]:
# Score the fitted logistic regression on the held-out split.
model.score(X=xtest, y=ytest)

In [None]:
# Intercept term of the fitted logistic regression.
model.intercept_

In [None]:
# Learned coefficients of the fitted logistic regression.
model.coef_

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Fit a decision tree (seeded for repeatability) and report its score on the
# held-out split.
clf = DecisionTreeClassifier(random_state=0)
# `X_idx_sorted` is no longer accepted by `fit` in current scikit-learn and
# raises a TypeError; the other keyword arguments were just the defaults.
clf.fit(xtrain, ytrain)
# The earlier get_params / predict / predict_log_proba calls discarded their
# results, so they are omitted here.
print(clf.score(xtest, ytest))

In [None]:
# Fit a 100-tree random forest on the training split.
# A fixed seed makes the forest reproducible across kernel restarts, matching
# the seeded train/test split and decision tree.
modelNew = RandomForestClassifier(n_estimators=100, random_state=0)
modelNew.fit(xtrain, ytrain)

In [None]:
# Predict with the random forest fitted in the preceding cell. Using `model`
# (the logistic regression) here would leave the forest unevaluated and only
# reproduce the score already reported by `model.score`.
prediction = modelNew.predict(xtest)

In [None]:
from sklearn import metrics

In [None]:
# Side-by-side view of true labels and predictions, keyed by the test-set row
# index. Stored under its own name so the raw dataset in `df` is not
# overwritten, which keeps earlier cells re-runnable.
results = pd.DataFrame({'actual': ytest, 'predicted': prediction})
results.head(10)

In [None]:
# Fraction of held-out rows whose value in `prediction` matches the true label.
metrics.accuracy_score(y_true=ytest, y_pred=prediction)