In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline 

## Models
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
!pip install xgboost
# from xgboost import XGBClassifier
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

Collecting xgboost
  Downloading xgboost-1.7.4-py3-none-win_amd64.whl (89.1 MB)


ModuleNotFoundError: No module named 'xgboost'

In [None]:
df = pd.read_csv('Data_Science_Internship - Dump.csv').drop(['Unnamed: 0'], axis=1)
df.head()

In [None]:
df = df.drop(columns=['Agent_id', 'lead_id'], axis=1)

# set the columns to consider for null values
columns_to_consider = ['lost_reason', 'budget', 'lease', 'movein','source', 'source_city', 'source_country', 
                       'utm_medium','des_city', 'des_country']

# set the minimum number of non-null values required for each row to be kept
thresh = len(columns_to_consider) / 2

# remove rows where 50% of the columns being considered have null values
df = df.dropna(subset=columns_to_consider, thresh=thresh)

search_string = '9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6c9bc9d493a23be9de0' 

# Count the number of occurrences of the string in the DataFrame
count = (df == search_string).sum().sum()

# Print the result
print(f"The string '{search_string}' occurs {count} times in the DataFrame.")

In [None]:
# drop rows with status other than 'WON' or 'LOST'
df = df[df['status'].isin(['WON', 'LOST'])]

# replace long strings with NaN
cols_to_exclude = ['Agent_id'] # exclude the Agent_id column
df.loc[:, ~df.columns.isin(cols_to_exclude)] = df.loc[:, ~df.columns.isin(cols_to_exclude)].apply(lambda x: x.mask(x.apply(lambda y: str(y) == search_string)))

null_perc = df.isnull().sum() / len(df) * 100
null_perc = null_perc.fillna("No Null Values")

print('The percentage of NULL Values in differenct columns are:')
null_perc

In [None]:
df['status'].value_counts()

In [None]:
sns.countplot(df['status'])

In [None]:
# Now we will take care of null values in each column one by one, starting from the column 
# with highest percentage of null values.

In [None]:
df['room_type'].value_counts(normalize=True) * 100

In [None]:
# since most people opt for an 'Ensuite" room and almost 55% values are Ensuite in this column,
# we can use mode imputation.
df['room_type'] = df['room_type'].fillna("Ensuite")
#df = df.drop(['room_type'], axis =1)

In [None]:
df['movein'].describe()

In [None]:
df['movein'] = df['movein'].fillna("Unknown")

In [None]:
df['source_city'].describe()

In [None]:
df['source_city'] = df['source_city'].fillna("Not_disclosed")

In [None]:
df['source_country'].describe()

In [None]:
df['source_country'].value_counts(normalize=True) * 100

In [None]:
mode = df['source_country'].mode()[0]
df['source_country'] = df['source_country'].fillna(mode)

In [None]:
df['source'].describe()

In [None]:
df['source'].value_counts(normalize=True) * 100

In [None]:
mode = df['source'].mode()[0]
df['source'] = df['source'].fillna(mode)

In [None]:
df['budget'].describe()

In [None]:
df['budget'] = df['budget'].fillna("Not_specified")

In [None]:
df['utm_medium'].describe()

In [None]:
mode = df['utm_medium'].mode()[0]
df['utm_medium'] = df['utm_medium'].fillna(mode)

In [None]:
df['lost_reason'].describe()

In [None]:
df['lost_reason'] = df['lost_reason'].fillna("lead_won")

In [None]:
df['lease'].describe()

In [None]:
df['lease'] = df['lease'].fillna("unknown")

In [None]:
df['des_city'].describe()

In [None]:
df['des_city'] = df['des_city'].fillna("unknown")

In [None]:
df['des_country'].describe()

In [None]:
mode = df['des_country'].mode()[0]
df['des_country'] = df['des_country'].fillna(mode)

In [None]:
df['utm_source'].describe()

In [None]:
mode = df['utm_source'].mode()[0]
df['utm_source'] = df['utm_source'].fillna(mode)

In [None]:
df.isnull().sum()

In [None]:
# Encode categorical features
le = LabelEncoder()
for col in df.select_dtypes(include=['object']):
    df[col] = le.fit_transform(df[col])

In [None]:
X = df.drop(['status'], axis=1)
y = df['status']

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, y_test = train_test_split(X, y, test_size=0.33, stratify = y, random_state=42)
print(X_train.shape)
print(X_test.shape)

##### Balancing the dataset using SMOTE method

In [None]:
# Since our classes are highly skewed we should make them equivalent in order to have a normal distribution of the classes.

from imblearn.over_sampling import SMOTE

smt = SMOTE(random_state=42)

X_res, y_res = smt.fit_resample(X_train, Y_train)

X_res.shape

In [None]:
# importing sklearn StandardScaler class which is for Standardization
from sklearn.preprocessing import StandardScaler

sc = StandardScaler() # creating an instance of the class object
X_res_scaled = pd.DataFrame(sc.fit_transform(X_res), columns=X_res.columns)  #fit and transforming StandardScaler the dataframe 

In [None]:
from sklearn .metrics import f1_score

sgd = linear_model.SGDClassifier(max_iter=5, tol=None)
sgd.fit(X_res_scaled, y_res)
y_pred = sgd.predict(X_test)
score_sgd = f1_score(y_test, y_pred, average='weighted')

logreg = LogisticRegression(max_iter=10000)
logreg.fit(X_res_scaled, y_res)
y_pred = logreg.predict(X_test)
score_log = f1_score(y_test, y_pred, average='weighted')

knn = KNeighborsClassifier(n_neighbors = 3) 
knn.fit(X_res_scaled, y_res)  
y_pred = knn.predict(X_test)  
score_knn = f1_score(y_test, y_pred, average='weighted')

gaussian = GaussianNB() 
gaussian.fit(X_res_scaled, y_res)  
y_pred = gaussian.predict(X_test)
score_gaussian = f1_score(y_test, y_pred, average='weighted')

perceptron = Perceptron(max_iter=100)
perceptron.fit(X_res_scaled, y_res)
y_pred = perceptron.predict(X_test)
score_perceptron = f1_score(y_test, y_pred, average='weighted')

decision_tree = DecisionTreeClassifier() 
decision_tree.fit(X_res_scaled, y_res)  
y_pred = decision_tree.predict(X_test)  
score_decision_tree = f1_score(y_test, y_pred, average='weighted')

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_res_scaled, y_res)
y_pred = random_forest.predict(X_test)
score_random_forest = f1_score(y_test, y_pred, average='weighted')

XGBoost = XGBClassifier()
XGBoost.fit(X_res_scaled, y_res)
y_pred = XGBoost.predict(X_test)
score_XGBoost = f1_score(y_test, y_pred, average='weighted')

from sklearn.neural_network import MLPClassifier

MLPClassifier = MLPClassifier()
MLPClassifier.fit(X_res_scaled, y_res)
y_pred = MLPClassifier.predict(X_test)
score_MLPClassifier = f1_score(y_test, y_pred, average='weighted')

# comparing all the models

results = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression','Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent','Decision Tree', 'XG Boost', 'Neural Network'],
    'Weighted F1': [score_knn, score_log, score_random_forest, score_gaussian, score_perceptron, score_sgd, 
              score_decision_tree, score_XGBoost, score_MLPClassifier]})
result_df = results.sort_values(by='Weighted F1', ascending=False)
result_df = result_df.set_index('Weighted F1')
result_df

In [None]:
from sklearn .metrics import recall_score

sgd = linear_model.SGDClassifier(max_iter=5, tol=None)
sgd.fit(X_res_scaled, y_res)
y_pred = sgd.predict(X_test)
score_sgd = recall_score(y_test, y_pred)

logreg = LogisticRegression(max_iter=10000)
logreg.fit(X_res_scaled, y_res)
y_pred = logreg.predict(X_test)
score_log = recall_score(y_test, y_pred)

knn = KNeighborsClassifier(n_neighbors = 3) 
knn.fit(X_res_scaled, y_res)  
y_pred = knn.predict(X_test)  
score_knn = recall_score(y_test, y_pred)

gaussian = GaussianNB() 
gaussian.fit(X_res_scaled, y_res)  
y_pred = gaussian.predict(X_test)
score_gaussian = recall_score(y_test, y_pred)

perceptron = Perceptron(max_iter=100)
perceptron.fit(X_res_scaled, y_res)
y_pred = perceptron.predict(X_test)
score_perceptron = recall_score(y_test, y_pred)

decision_tree = DecisionTreeClassifier() 
decision_tree.fit(X_res_scaled, y_res)  
y_pred = decision_tree.predict(X_test)  
score_decision_tree = recall_score(y_test, y_pred)

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_res_scaled, y_res)
y_pred = random_forest.predict(X_test)
score_random_forest = recall_score(y_test, y_pred)

XGBoost = XGBClassifier()
XGBoost.fit(X_res_scaled, y_res)
y_pred = XGBoost.predict(X_test)
score_XGBoost = recall_score(y_test, y_pred)

from sklearn.neural_network import MLPClassifier

MLPClassifier = MLPClassifier()
MLPClassifier.fit(X_res_scaled, y_res)
y_pred = MLPClassifier.predict(X_test)
score_MLPClassifier = recall_score(y_test, y_pred)

# comparing all the models

results = pd.DataFrame({
    'Model': ['KNN', 'Logistic Regression','Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent','Decision Tree', 'XG Boost', 'Neural Network'],
    'Recall': [score_knn, score_log, score_random_forest, score_gaussian, score_perceptron, score_sgd, 
              score_decision_tree, score_XGBoost, score_MLPClassifier]})
result_df = results.sort_values(by='Recall', ascending=False)
result_df = result_df.set_index('Recall')
result_df

In [None]:
from sklearn.neural_network import MLPClassifier

MLPClassifier = MLPClassifier()
MLPClassifier.fit(X_res_scaled, y_res)
y_pred = MLPClassifier.predict(X_test)
y_prob = MLPClassifier.predict_proba(X_test)[:, 1]

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter grid to search over
param_grid = {
    'hidden_layer_sizes': [(10,), (20,), (30,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive']
}

# Define the grid search object
rand_search = RandomizedSearchCV(MLPClassifier, param_distributions=param_grid, cv=5, scoring='recall')

# Fit the grid search object to the training data
rand_search.fit(X_res_scaled, y_res)

# Get the best model and evaluate on the test data
best_mlp = rand_search.best_estimator_
y_pred = best_mlp.predict(X_test)
y_prob = best_mlp.predict_proba(X_test)[:, 1]

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Compute evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# Print evaluation metrics
print('Accuracy: {:.4f}'.format(accuracy))
print('Precision: {:.4f}'.format(precision))
print('Recall: {:.4f}'.format(recall))
print('F1 score: {:.4f}'.format(f1))
print('AUC: {:.4f}'.format(auc))
print('Confusion matrix: ')
print(confusion_matrix(y_test, y_pred))
print('True negative: {}'.format(tn))
print('False positive: {}'.format(fp))
print('False negative: {}'.format(fn))
print('True positive: {}'.format(tp))

In [None]:
# Predict probabilities for the test set
y_pred_proba = best_mlp.predict_proba(X)[:, 1]

# Convert probabilities to scores between 0 to 100
lead_score = (y_pred_proba * 100).round(0)

# Add the 'lead_score' column to the original dataframe
df['lead_score'] = lead_score

In [None]:
df.head()