In [None]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings scikit-learn may
# emit from the model-fitting cells below (e.g. convergence or parameter
# warnings) — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Load the semicolon-delimited CSV straight from its GitHub URL.
df = pd.read_csv(
    'https://raw.githubusercontent.com/rselcuk/dataReduction/master/bank-additional-full.csv',
    sep=';',
)
# Print the column/dtype summary, then show the first rows as the cell output.
df.info()
df.head()

In [None]:
# Inspect the distribution of the target variable.
X = df.drop(columns=['y'])
y = df['y']
print('y Distribution:')
print(y.value_counts())
# Binary-encode the target: 'no' -> 0, every other label -> 1.
y = np.where(df['y'].eq('no'), 0, 1)

In [None]:
# Preview the last 20 rows of the object-dtype columns of X.
X.select_dtypes(include=['object']).tail(n=20)

In [None]:
# Select the object-dtype columns and replace the 'unknown' placeholder with
# the column's most frequent known category.
categorical_columns = [c for c in X.columns if X[c].dtype.name == 'object']
for c in categorical_columns:
  is_unknown = X[c] == 'unknown'
  # Series.mode() returns a Series (one entry per tied value), so take a single
  # scalar with .iloc[0]. The mode is computed over known values only so that
  # 'unknown' can never be chosen as its own replacement.
  known_modes = X.loc[~is_unknown, c].mode()
  # A column consisting solely of 'unknown' has no replacement; leave it as is.
  if not known_modes.empty:
    X[c] = X[c].mask(is_unknown, known_modes.iloc[0])
# Show the tail of the categorical columns to eyeball the result.
X.select_dtypes(include='object').tail(20)

In [None]:
# One-hot encode the object-dtype columns and append the indicator columns.
dummy_frame = pd.get_dummies(X.select_dtypes(include='object'))
X = pd.concat([X, dummy_frame], axis=1)
# Drop 'duration' together with the original (now encoded) categorical columns.
# NOTE(review): the reason for excluding 'duration' is not stated in the
# notebook — confirm it is intentional.
X = X.drop(
    columns=[
        'duration',
        'job', 'marital', 'education', 'default', 'housing', 'loan',
        'contact', 'month', 'day_of_week', 'poutcome',
    ]
)
X.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix as cm

# Hold out 30% of the rows for testing; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Random forest of 100 trees, each limited to depth 3, fitted on the training split.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    random_state=42,
)
rf.fit(X_train, y_train)

In [None]:
# Model performance metrics: confusion matrix.
predictions = rf.predict(X_test)
score = round(accuracy_score(y_test, predictions), 3)
cm1 = cm(y_test, predictions)
# Draw the matrix as an annotated heatmap and label it through the Axes object.
ax = sns.heatmap(cm1, annot=True, fmt=".0f")
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Actual Values')
ax.set_title('Accuracy Score: {0}'.format(score), size=15)
plt.show()

In [None]:
# Model performance metrics: precision, recall, f1-score.
from sklearn.metrics import classification_report
# The target was encoded as 'no' -> 0 and 'yes' -> 1, and target_names are
# matched to the labels in sorted order, so the names must be listed as
# ['no', 'yes']. Passing labels explicitly pins that mapping.
print(classification_report(y_test, predictions, labels=[0, 1], target_names=['no', 'yes']))

In [None]:
# Visualize one of the forest's decision trees.
from ipywidgets import Image
from io import StringIO
import pydotplus
from sklearn.tree import  export_graphviz

# Take the last fitted tree instead of hard-coding index 99, so this cell keeps
# working if n_estimators is changed (with 100 trees it is the same estimator).
d_tree99 = rf.estimators_[-1]
dot_data1 = StringIO()
# Pass the feature names as a plain list taken from the frame the forest was
# fitted on; a list is accepted by every scikit-learn release, whereas a pandas
# Index may be rejected by stricter parameter validation.
export_graphviz(
    d_tree99,
    feature_names=list(X_train.columns),
    out_file=dot_data1,
    filled=True,
    rounded=True,
)
graph = pydotplus.graph_from_dot_data(dot_data1.getvalue())
# Save the rendering to disk and also display it inline as the cell output.
graph.write_png('rf_5b.png')
Image(value = graph.create_png())

In [None]:
# Feature-importance ranking of the model.
# Note: this cell refits a deeper forest (max_depth=8) and rebinds both `rf`
# and `predictions`, replacing the values produced by the earlier cells.
rf = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=42)
rf.fit(X_train,y_train)
predictions=rf.predict(X_test)
# The target was encoded as 'no' -> 0 and 'yes' -> 1, and target_names follow
# the sorted label order, so the names must be ['no', 'yes'].
print(classification_report(y_test,predictions,labels=[0,1],target_names=['no','yes']))
plt.figure(figsize=(16,9))

ranking = rf.feature_importances_
# Indices of the 10 most important features, in descending order of importance.
features = np.argsort(ranking)[::-1][:10]
# Take the labels from the frame the model was fitted on so they are guaranteed
# to line up with feature_importances_.
columns = X_train.columns

plt.title("Feature importances based on Random Forest Classifier",y=1.03,size=18)
plt.bar(range(len(features)),ranking[features],color="aqua",align="center")
plt.xticks(range(len(features)),columns[features],rotation=80)
plt.show()

In [34]:
# Rebuild the feature matrix from the raw CSV so this cell can run on its own.
df = pd.read_csv('https://raw.githubusercontent.com/rselcuk/dataReduction/master/bank-additional-full.csv', sep=';')
X = df.drop(['y'],axis=1)
y = df['y']  # the target is kept as the raw string labels in this cell
categorical_columns = [c for c in X.columns if X[c].dtype.name == 'object']
for c in categorical_columns:
  is_unknown = X[c] == 'unknown'
  # Series.mode() returns a Series (one entry per tied value); take one scalar
  # computed over known values only so 'unknown' is never its own replacement.
  known_modes = X.loc[~is_unknown, c].mode()
  if not known_modes.empty:
    X[c] = X[c].mask(is_unknown, known_modes.iloc[0])
# One-hot encode the categorical columns, then drop 'duration' and the originals.
X = pd.concat([X, pd.get_dummies(X.select_dtypes(include='object'))],axis=1)
X = X.drop(['duration'],axis=1)
X = X.drop(['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome'],axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0) # split into training and test sets
# NOTE(review): PCA and LogisticRegression are fitted on unscaled features here;
# standardizing first may be worth evaluating, but it would change the numbers
# recorded in this notebook.
pca = PCA(n_components=6) # number of components to reduce to
X_train_pca=pca.fit_transform(X_train) # learn the projection on X_train and apply it
X_test_pca=pca.transform(X_test) # apply the already-learned projection to X_test
lr = LogisticRegression(random_state=0) # baseline: logistic regression on all features
lr.fit(X_train,y_train) # train on the original (non-reduced) features first
lr_pca = LogisticRegression(random_state=0)
lr_pca.fit(X_train_pca,y_train) # train on the features reduced to 6 principal components
y_pred = lr.predict(X_test) # predict with the non-reduced model
y_pred_pca = lr_pca.predict(X_test_pca) # predict with the PCA-reduced model
print("PCA Kullanılmadan CM")
print(confusion_matrix(y_test,y_pred)) # confusion matrix without PCA
print(accuracy_score(y_test, y_pred)) # accuracy without PCA
print("PCA Kullanılan CM")
print(confusion_matrix(y_test,y_pred_pca)) # confusion matrix with PCA
print(accuracy_score(y_test, y_pred_pca)) # accuracy with PCA

PCA Kullanılmadan CM
[[10835   134]
 [ 1124   264]]
0.8981953548595938
PCA Kullanılan CM
[[10847   122]
 [ 1131   257]]
0.8985999838148417


In [32]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# Rebuild the feature matrix from the raw CSV so this cell can run on its own.
df = pd.read_csv('https://raw.githubusercontent.com/rselcuk/dataReduction/master/bank-additional-full.csv', sep=';')
X = df.drop(['y'],axis=1)
y = df['y']  # the target is kept as the raw string labels in this cell
categorical_columns = [c for c in X.columns if X[c].dtype.name == 'object']
for c in categorical_columns:
  is_unknown = X[c] == 'unknown'
  # Series.mode() returns a Series (one entry per tied value); take one scalar
  # computed over known values only so 'unknown' is never its own replacement.
  known_modes = X.loc[~is_unknown, c].mode()
  if not known_modes.empty:
    X[c] = X[c].mask(is_unknown, known_modes.iloc[0])
# One-hot encode the categorical columns, then drop 'duration' and the originals.
X = pd.concat([X, pd.get_dummies(X.select_dtypes(include='object'))],axis=1)
X = X.drop(['duration'],axis=1)
X = X.drop(['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome'],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0) # split into training and test sets
# LDA can produce at most min(n_features, n_classes - 1) components. The target
# is binary (the confusion matrix is 2x2), so the maximum is 1; asking for 2 is
# rejected with a ValueError by current scikit-learn.
lda = LDA(n_components = 1)
X_train_lda = lda.fit_transform(X_train,y_train) # supervised projection learned on the training set
X_test_lda = lda.transform(X_test) # apply the learned projection to the test set
lr_lda = LogisticRegression(random_state=0)
lr_lda.fit(X_train_lda,y_train)
y_pred_lda = lr_lda.predict(X_test_lda)
print("LDA Kullanılarak CM")
print(confusion_matrix(y_test,y_pred_lda)) # confusion matrix with LDA
print(accuracy_score(y_test,y_pred_lda)) # accuracy with LDA

LDA Kullanılarak CM
[[10787   182]
 [ 1076   312]]
0.8981953548595938
