# מבחן בית בדאטא סיינס לתעשייה 2020

In [None]:
#install 
!pip install imblearn

In [None]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
import sklearn.linear_model as sk
from sklearn.metrics import classification_report
import sklearn.metrics as metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix


# --- Data Understanding ---
* **Load Dataset**
 

In [None]:
# Load the depression dataset from its CSV file and preview the resulting frame.
csv_path = '../input/depression-1/depression_45901.csv'
depression_data = pd.read_csv(csv_path)
depression_data

In [None]:
# List the column names of the loaded dataset.
depression_data.columns

In [None]:
# Show the dtype pandas inferred for each column.
depression_data.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
depression_data.describe()

In [None]:
# Number of rows for each value of the target column `depressed`.
depression_data['depressed'].value_counts()

**As we can see, the class feature is imbalanced.**

**Information about each feature, including a histogram annotated with its mean, max and min values:**

In [None]:
# One histogram per column, with the mean drawn as a dashed vertical line and
# the mean / max / min values written onto the plot.
for i in depression_data.columns:
    plt.figure()
    data = depression_data[i]
    col_mean = data.mean()  # computed once; reused for the line and its label
    data.plot.hist(grid=True, bins=30, rwidth=0.9,
                   color='#607c8e')
    plt.title(i)
    plt.xlabel('Actual Value')
    plt.ylabel('Counts')

    # Dashed vertical line at the column mean.
    plt.axvline(col_mean, color='k', linestyle='dashed', linewidth=2)

    # Text annotations, placed relative to the current upper y-limit.
    max_ylim = plt.ylim()[1]
    plt.text(col_mean, max_ylim*0.5, 'Mean: {:.2f}'.format(col_mean))
    plt.text(data.max(), max_ylim*0.9, 'Max: {:.2f}'.format(data.max()))
    plt.text(data.min(), max_ylim*0.9, 'Min: {:.2f}'.format(data.min()))

    plt.grid(axis='y', alpha=0.75)
    # Render this figure before starting the next one so that one figure per
    # column does not pile up while the loop is still running.
    plt.show()

In [None]:
# Distribution of every feature, split by the value of the target `depressed`.
# The target column itself is skipped: plotting it against itself carries no
# information.
for col in depression_data.columns.drop("depressed"):
    sns.violinplot(x="depressed", y=col, data=depression_data)
    plt.title(col)  # label each figure so it can be read on its own
    plt.show()



# --- Data preparation ---

nan values:

In [None]:
# Discard every row that holds at least one missing value, then show summary
# statistics of each column separately for each value of `depressed`.
depression_data = depression_data.dropna(axis=0, how='any')
depression_data.groupby('depressed').describe()

In [None]:
# Total number of missing cells across the whole frame
# (per-column counts first, then summed over the columns).
depression_data.isnull().sum().sum()

**After removing the rows with NaN values, we have 1409 records instead of 1429.**

# Features Selection:

I chose to keep the 10 features that have the strongest relationship with the depression class.

In [None]:
#1 ------ correlation matrix ---------
# Pairwise correlation between all columns, drawn as an annotated heatmap
# on an explicitly created axes.
fig, ax = plt.subplots(figsize=(20,15))
cor = depression_data.corr()
sns.set(font_scale=0.8)
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds, ax=ax)
plt.show()


In [None]:
# Rank the features by the absolute value of their correlation with the target.
# The target's own entry (always 1.0) is removed so it is not reported as a
# "relevant feature".
cor_target = cor["depressed"].drop("depressed").abs()

# Keep the features whose |correlation| with the target exceeds the threshold.
# The cut-off is 0.02 (not 0.2 as an earlier comment claimed).
CORR_THRESHOLD = 0.02
relevant_features_cor = (
    cor_target[cor_target > CORR_THRESHOLD].sort_values(ascending=False)
)
relevant_features_cor


In [None]:
#2 ---------- Univariate statistical test: score every feature against the
# target and list the 10 highest-scoring ones. SelectKBest is used with its
# default score function (ANOVA F-value), not chi-squared.
data = depression_data
# Select columns by name rather than by position, so the feature set does not
# silently depend on how many columns the file has or on their order.
X = data.drop(columns=['depressed'])  # independent columns
y = data['depressed']                 # target column
bestfeatures = SelectKBest(k=10)
fit = bestfeatures.fit(X, y)
# One row per feature: its name and its score.
featureScores = pd.DataFrame({'Specs': X.columns, 'Score': fit.scores_})
print(featureScores.nlargest(10, 'Score'))  # print the 10 best features

In [None]:
# Columns kept as model inputs. The names are written exactly as the frame is
# indexed (including the spelling 'no_lasting_investmen').
selected_columns = [
    'Age', 'Married', 'education_level', 'total_members',
    'durable_asset', 'living_expenses', 'incoming_business', 'incoming_no_business',
    'no_lasting_investmen', 'Ville_id', 'incoming_agricultural',
]
features_df_new_cor = depression_data[selected_columns]
features_df_new_cor

# --- Modeling & Evaluation ---

# Train & Test:

In [None]:
# Feature matrix and boolean target used by all models below.
depression_data_depressed = depression_data['depressed']
X = features_df_new_cor
y = depression_data_depressed.astype('bool')

# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=19)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Report the shape of each of the four pieces produced by the split.
for label, part in (
    ("X - Train data shape: ", X_train),
    ("X - Test data shape: ", X_test),
    ("y - Train data shape: ", y_train),
    ("y - Test data shape: ", y_test),
):
    print(label, part.shape)

# 1. Logistic Regression

In [None]:
# Train a logistic regression (library defaults) on the training split.
# y_train is passed directly: it is already one-dimensional, and the former
# `y_train.ravel()` call relied on `Series.ravel`, which is deprecated in
# recent pandas versions.
model = sk.LogisticRegression().fit(X_train, y_train)

# Predict on the held-out test split.
y_pred = model.predict(X_test)

# Accuracy and recall are stored for the comparison charts further down.
log_acc = metrics.accuracy_score(y_test, y_pred)
log_recall = metrics.recall_score(y_test, y_pred)

# Per-class report followed by the two headline numbers.
print(classification_report(y_test, y_pred))
print("Accuracy:",log_acc)
print("recall:",log_recall)

In [None]:

# Flatten the confusion matrix of the latest predictions into its four cells
# (order: TN, FP, FN, TP) and print each with its label.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
for label, count in (
    ("True Negatives: ", tn),
    ("False Positives: ", fp),
    ("False Negatives: ", fn),
    ("True Positives: ", tp),
):
    print(label, count)

# 2. KNN

In [None]:
# Re-create the train/test partition before fitting KNN. The arguments and
# random_state are the same as in the first split, so the partition is the same.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19)

In [None]:
# Fit a k-nearest-neighbours classifier (library defaults) on the training split.
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)

# Score the held-out test split; the two metrics are kept for the comparison charts.
y_pred = knn_model.predict(X_test)
knn_acc = metrics.accuracy_score(y_test, y_pred)
knn_recall = metrics.recall_score(y_test, y_pred)

# Per-class report followed by the two headline numbers.
print(classification_report(y_test, y_pred))
print("Accuracy:",knn_acc)
print("recall:",knn_recall)

# 3. Random Forest

In [None]:
# Re-create the train/test partition before fitting the random forest. The
# arguments and random_state are the same as in the first split, so the
# partition is the same.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19)

In [None]:
# Fit a random forest on the training split. The forest is built with
# randomness (bootstrap samples, feature sub-sampling), so a fixed random_state
# is required for the reported metrics to be reproducible between runs; the
# seed matches the one used for the train/test split.
rf_model = RandomForestClassifier(random_state=19).fit(X_train, y_train)

# Predict on the held-out test split.
y_pred = rf_model.predict(X_test)

# Accuracy and recall are stored for the comparison charts further down.
random_acc = metrics.accuracy_score(y_test, y_pred)
random_recall = metrics.recall_score(y_test, y_pred)

# Per-class report followed by the two headline numbers.
print(classification_report(y_test, y_pred))
print("Accuracy:",random_acc)
print("recall:",random_recall)

**As we can see, we got a high accuracy BUT a very low recall!**

# Comparison Accuracy of Classification Algorithms

In [None]:
# taken from: https://www.kaggle.com/osmanozen/comparison-of-classification-algorithms
# Accuracy (in percent) of each baseline model, one row per algorithm.
# The table is built in a single constructor call: `DataFrame.append`, used
# previously to add one row at a time, no longer exists in pandas 2.x.
results_acc = pd.DataFrame(
    [
        ["LogisticRegression", log_acc*100],
        ["KNeighborsClassifier", knn_acc*100],
        ["RandomForest", random_acc*100],
    ],
    columns=["Algorithms", "Accuracy"],
)

In [None]:
# Horizontal bar chart of accuracy per algorithm, labelled through the
# axes object that seaborn returns.
ax = sns.barplot(x='Accuracy', y='Algorithms', data=results_acc, color="b")
ax.set_xlabel('Accuracy %')
ax.set_title(' Comparison of Classification Algorithms');


# Comparison Recall of Classification Algorithms

In [None]:
# taken from: https://www.kaggle.com/osmanozen/comparison-of-classification-algorithms
# Recall (in percent) of each baseline model, one row per algorithm.
# The table is built in a single constructor call: `DataFrame.append`, used
# previously to add one row at a time, no longer exists in pandas 2.x.
results_recall = pd.DataFrame(
    [
        ["LogisticRegression", log_recall*100],
        ["KNeighborsClassifier", knn_recall*100],
        ["RandomForest", random_recall*100],
    ],
    columns=["Algorithms", "Recall"],
)

In [None]:
# Horizontal bar chart of recall per algorithm, labelled through the
# axes object that seaborn returns.
ax = sns.barplot(x='Recall', y='Algorithms', data=results_recall, color="b")
ax.set_xlabel('Recall %')
ax.set_title(' Comparison of Classification Algorithms');

# 3. Improve results:

*As you can see, we got a poor recall even though the accuracy is high,
so we need to deal with the imbalanced data. I am going to use SMOTE to address this problem.*

# Imbalanced Dataset:

I had help from this website: https://medium.com/@saeedAR/smote-and-near-miss-in-python-machine-learning-in-imbalanced-datasets-b7976d9a7a79

In [None]:
# Class counts in the training target before any resampling.
y_train.value_counts()


In [None]:
# SMOTE is applied to the training split only; the test split is left
# untouched so evaluation still runs on the original data.
# `fit_resample` is the current imbalanced-learn API; the older `fit_sample`
# alias has been removed from the library. A fixed random_state makes the
# generated synthetic samples reproducible between runs.
smt = SMOTE(random_state=19)
X_train, y_train = smt.fit_resample(X_train, y_train)

In [None]:
# Class counts in the training target after oversampling.
y_train.value_counts()

Now we can see that X_train and y_train are balanced!

# 3.1. Logistic Regression

Modeling after dealing with the imbalanced class

In [None]:
# Fit a logistic regression (library defaults) on the current training data.
lr = LogisticRegression().fit(X_train, y_train)

# Evaluate on the test split; both metrics are kept for the comparison charts.
y_pred = lr.predict(X_test)
log_score_s = metrics.accuracy_score(y_test, y_pred)
log_recall_s = metrics.recall_score(y_test, y_pred)

# Per-class report followed by the two headline numbers.
print(classification_report(y_test, y_pred))
print("Accuracy:",log_score_s)
print("Recall:",log_recall_s)

In [None]:

# Flatten the confusion matrix of the latest predictions into its four cells
# (order: TN, FP, FN, TP) and print each with its label.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
for label, count in (
    ("True Negatives: ", tn),
    ("False Positives: ", fp),
    ("False Negatives: ", fn),
    ("True Positives: ", tp),
):
    print(label, count)

# 3.2.KNN

In [None]:
# Fit a k-nearest-neighbours classifier (library defaults) on the current training data.
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)

# Evaluate on the test split; both metrics are kept for the comparison charts.
y_pred = knn_model.predict(X_test)
knn_score_s = metrics.accuracy_score(y_test, y_pred)
knn_recall_s = metrics.recall_score(y_test, y_pred)

# Per-class report followed by the two headline numbers.
print(classification_report(y_test, y_pred))
print("Accuracy:",knn_score_s)
print("recall:",knn_recall_s)

# 3.3 Random Forest

In [None]:
# Fit a random forest on the current training data. The forest is built with
# randomness (bootstrap samples, feature sub-sampling), so a fixed random_state
# is required for the reported metrics to be reproducible between runs; the
# seed matches the one used for the train/test split.
rf_model = RandomForestClassifier(random_state=19).fit(X_train, y_train)

# Predict on the held-out test split.
y_pred = rf_model.predict(X_test)

# Accuracy and recall are stored for the comparison charts further down.
random_score_s = metrics.accuracy_score(y_test, y_pred)
random_recall_s = metrics.recall_score(y_test, y_pred)

# Per-class report followed by the two headline numbers.
print(classification_report(y_test, y_pred))
print("Accuracy:",random_score_s)
print("recall:",random_recall_s)

# Comparison Accuracy of Classification Algorithms After SMOTE

In [None]:
# taken from: https://www.kaggle.com/osmanozen/comparison-of-classification-algorithms
# Accuracy (in percent) of each model in this section, one row per algorithm.
# The table is built in a single constructor call: `DataFrame.append`, used
# previously to add one row at a time, no longer exists in pandas 2.x.
results_acc_s = pd.DataFrame(
    [
        ["LogisticRegression", log_score_s*100],
        ["KNeighborsClassifier", knn_score_s*100],
        ["RandomForest", random_score_s*100],
    ],
    columns=["Algorithms", "Accuracy"],
)

In [None]:
# Horizontal bar chart of accuracy per algorithm, labelled through the
# axes object that seaborn returns.
ax = sns.barplot(x='Accuracy', y='Algorithms', data=results_acc_s, color="b")
ax.set_xlabel('Accuracy %')
ax.set_title(' Comparison of Classification Algorithms');


# Comparison Recall of Classification Algorithms After SMOTE

In [None]:
# taken from: https://www.kaggle.com/osmanozen/comparison-of-classification-algorithms
# Recall (in percent) of each model in this section, one row per algorithm.
# The table is built in a single constructor call: `DataFrame.append`, used
# previously to add one row at a time, no longer exists in pandas 2.x.
results_recall_s = pd.DataFrame(
    [
        ["LogisticRegression", log_recall_s*100],
        ["KNeighborsClassifier", knn_recall_s*100],
        ["RandomForest", random_recall_s*100],
    ],
    columns=["Algorithms", "Recall"],
)

In [None]:
# Horizontal bar chart of recall per algorithm, labelled through the
# axes object that seaborn returns.
ax = sns.barplot(x='Recall', y='Algorithms', data=results_recall_s, color="b")
ax.set_xlabel('Recall %')
ax.set_title(' Comparison of Classification Algorithms');