In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# setting Jedha color palette as default
pio.templates["jedha"] = go.layout.Template(
    layout_colorway=["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]
)
pio.templates.default = "jedha"
pio.renderers.default = "svg" # to be replaced by "iframe" if working on JULIE
from IPython.display import display

In [2]:
df=pd.read_csv('C:/Users/serda/OneDrive/Bureau/Online Education/Certification/Conversion Rate/conversion_data_train.csv')


In [3]:
df= df[df['country']!='China']

In [5]:
df= df.drop('age', axis=1)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay

In [7]:
X=df.drop('converted', axis=1)
y=df['converted']

In [8]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [9]:
categorical_features = ['country', 'source'] 
categorical_transformer = Pipeline(steps=[('encoder', OneHotEncoder(drop='first'))])

numeric_features = ['new_user', 'total_pages_visited']
scaler=StandardScaler()

In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [11]:
X_train = preprocessor.fit_transform(X_train)

In [12]:
X_test = preprocessor.transform(X_test)

In [13]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [14]:
model = LogisticRegression()

In [15]:
model.fit(X_train,y_train)

LogisticRegression()

In [16]:
y_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)


In [17]:
print("f1-score on train set : ", f1_score(y_train, y_train_pred))
print("f1-score on test set : ", f1_score(y_test, y_pred))

f1-score on train set :  0.7495581477553905
f1-score on test set :  0.7567567567567569


In [21]:
from sklearn.model_selection import GridSearchCV

In [22]:
C= np.logspace(0.01, 0.1, 1, 10)

In [23]:
param_grid = {'C': C}

In [24]:
model_Grid= GridSearchCV(model, param_grid=param_grid, cv=5)


In [27]:
model_Grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': array([1.02329299])})

In [28]:
y_pred_grid = model_Grid.predict(X_test)
y_train_pred_grid = model_Grid.predict(X_train)
print("f1-score on train set : ", f1_score(y_train, y_train_pred_grid))
print("f1-score on test set : ", f1_score(y_test, y_pred_grid))

f1-score on train set :  0.7495581477553905
f1-score on test set :  0.7567567567567569


In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
rfc = RandomForestClassifier(random_state=42)

In [32]:
rfc.fit(X_train,y_train)

RandomForestClassifier(random_state=42)

In [33]:
y_pred_rfc = rfc.predict(X_test)
y_train_pred_rfc = rfc.predict(X_train)

In [34]:
print("f1-score on train set : ", f1_score(y_train, y_train_pred_rfc))
print("f1-score on test set : ", f1_score(y_test, y_pred_rfc))

f1-score on train set :  0.7586088449978533
f1-score on test set :  0.7614496216646753


# Best Classifier All Data

In [35]:
# Concatenate our train and test set to train your best classifier on all data with labels
X = np.append(X_train,X_test,axis=0)
y = np.append(y_train,y_test)

In [36]:
model_Grid.fit(X,y)

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': array([1.02329299])})

In [43]:
data_without_labels = pd.read_csv('C:/Users/serda/OneDrive/Bureau/Online Education/Certification/Conversion Rate/conversion_data_test.csv')

In [44]:
print('Prediction set (without labels) :', data_without_labels.shape)

Prediction set (without labels) : (31620, 5)


In [45]:
data_without_labels=data_without_labels[data_without_labels['country']!='China']

In [46]:
data_without_labels= data_without_labels.drop('age', axis=1)

In [47]:
###features_list = ['total_pages_visited']
X_without_labels = data_without_labels
X_without_labels.head()

Unnamed: 0,country,new_user,source,total_pages_visited
0,UK,0,Seo,16
1,UK,1,Direct,5
3,US,1,Ads,6
5,US,1,Seo,1
7,UK,0,Seo,1


In [48]:
X_without_labels=preprocessor.transform(X_without_labels)

In [49]:
# Make predictions and dump to file
# WARNING : MAKE SURE THE FILE IS A CSV WITH ONE COLUMN NAMED 'converted' AND NO INDEX !
# WARNING : FILE NAME MUST HAVE FORMAT 'conversion_data_test_predictions_[name].csv'
# where [name] is the name of your team/model separated by a '-'
# For example : [name] = AURELIE-model1
data = {
    'converted': model_Grid.predict(X_without_labels)
}

Y_predictions = pd.DataFrame(columns=['converted'],data=data)
Y_predictions.to_csv('conversion_data_test_predictions_Serdar_model2.csv', index=False)

In [50]:
all_y_pred= model_Grid.predict(X_without_labels)

In [60]:
print("f1-score on test set : ", f1_score(y_test_sample, all_y_pred))

f1-score on test set :  0.040290990486849476


In [55]:
all_y_pred.shape

(24140,)

In [59]:
y_test_sample= y_test.sample(24140)

In [61]:
y.shape

(215458,)