# Importing the relevant libraries

In [None]:
### Python libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings("ignore")
import statsmodels.api as sm

### Data Preprocessing
### Data:
     Student data from 365datascience. It contains country, days_on_platform, mins_watched, course_started, practice_exam_passed, minutes_spent_on_exams, purchased.

### Goal:
    The data relates to 365datascience courses. 365datascience.com provides high-quality courses: a few minutes of course content are free, but most of the content is paid, at an affordable price. Our goal, therefore, is to predict whether a student will purchase a course or not.

### Importing the Database

In [None]:
df = pd.read_csv("ml_datasource.csv")
df.head()

In [None]:
## Show the dtypes, non-null counts and column names
df.info()

In [None]:
# Descriptive statistics
df.describe(include="all")

In [None]:
for i in df.columns:
    print(df[i].value_counts())

### Removing Outliers

In [None]:
plt.style.available

In [None]:
plt.style.use('fivethirtyeight')

In [None]:
# Box plot of every column except the country label, used to eyeball outliers.
# One figure/axes pair is created and handed to pandas explicitly: calling
# plt.figure() first and then DataFrame.plot.box() without ``ax`` makes pandas
# open a second figure, leaving the first one empty.
_, box_ax = plt.subplots(figsize=(15,7))
df.drop("student_country",axis=1).plot.box(ax=box_ax)
plt.title("Boxplot of dataFrame to detect outliers ")
plt.xticks(rotation=45)
plt.xlabel("columns")
plt.tight_layout()
# Save before show(): showing may close the figure, which would save a blank image.
plt.savefig("Boxplot of dataFrame to detect outliers.png")
plt.show()

In [None]:
df[["minutes_watched","purchased"]].plot.box()
plt.title("Boxplot of dataFrame to detect outliers ")
plt.xticks(rotation=45)
plt.xlabel("columns")
plt.tight_layout()
plt.savefig("Boxplot of dataFrame to detect outliers(2-variables).png")
plt.show()

In [None]:
def outlier_removal(col, data=None, whisker=1.5):
    """Return the rows whose ``col`` value lies within the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``. Values
    lying exactly on a fence are kept: they are not outside the fences, and a
    strict comparison would discard every row of a column whose IQR is 0.

    Parameters
    ----------
    col : str
        Name of the numeric column to filter on.
    data : pandas.DataFrame, optional
        Frame to filter. Defaults to the notebook-level ``df``.
    whisker : float, default 1.5
        Multiplier applied to the IQR to place the fences.

    Returns
    -------
    pandas.DataFrame
        The filtered rows; the input frame is not modified. Rows whose ``col``
        value is missing are dropped, as comparisons with NaN are false.
    """
    frame = df if data is None else data
    q1 = frame[col].quantile(.25)
    q3 = frame[col].quantile(.75)
    iqr = q3 - q1
    low = q1 - whisker * iqr
    high = q3 + whisker * iqr
    # Series.between is inclusive on both ends by default.
    return frame[frame[col].between(low, high)]

In [None]:
original_data = df.copy() ### copying data

In [None]:
df = outlier_removal("minutes_watched")

In [None]:
df.columns

In [None]:
df.drop("student_country",axis=1).plot.box(figsize=(15,7))
plt.title("Boxplot of dataFrame to detect outliers ")
plt.xticks(rotation=45)
plt.xlabel("columns")
plt.tight_layout()

### Checking for Multicollinearity

In [None]:
plt.figure(figsize=(15,7))
sns.heatmap(df.drop("student_country",axis=1).corr(),annot=True)
plt.title("Correlation Map")
plt.tight_layout()
plt.savefig("correlation Map")
plt.show()

In [None]:
df.drop("student_country",axis=1).corr()

### Dealing with NaN Values

In [None]:
## Data Cleaning - Remove null values 
def data_cleaning(col, var_type="categorical", data=None):
    """Fill the missing values of one column in place.

    Categorical columns are filled with the most frequent value (the first
    mode); any other ``var_type`` is filled with the column mean.

    Parameters
    ----------
    col : str
        Name of the column to fill.
    var_type : str, default "categorical"
        ``"categorical"`` selects mode imputation; anything else selects mean
        imputation.
    data : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The frame is modified in place.
    """
    frame = df if data is None else data
    if var_type == "categorical":
        modes = frame[col].mode()
        if modes.empty:
            # Every value is missing, so there is no mode to fill with.
            return
        fill_value = modes.iloc[0]
    else:
        fill_value = frame[col].mean()
    # Assign the filled column back instead of ``frame[col].fillna(...,
    # inplace=True)``: that form is chained assignment and does not update the
    # frame when pandas Copy-on-Write is active.
    frame[col] = frame[col].fillna(fill_value)
        
data_cleaning("student_country")
df.isnull().sum()

In [None]:
if df.duplicated().sum() > 0:
    df.drop_duplicates(inplace=True)

In [None]:
df.head()

### Encoding the Data

In [None]:
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder
label = OrdinalEncoder()
df["student_country_enc"] = label.fit_transform(np.array(df["student_country"]).reshape(-1,1))

In [None]:
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder
label = LabelEncoder()
df["student_country_new_enc"] = label.fit_transform(df["student_country"])

In [None]:
df.head()

### Splitting the Data

In [None]:
X = df.drop(["purchased","student_country","student_country_new_enc"],axis=1)
y = df["purchased"]

In [None]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 split. The seed is fixed so that every score, plot and
# hyper-parameter choice made further down can be reproduced on a re-run.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,stratify=y,random_state=42)

In [None]:
X_train

In [None]:
from imblearn.over_sampling import SMOTE,RandomOverSampler
# Oversample the minority class on the TRAINING data only; the test set keeps
# its original class balance. The seed is fixed so the synthetic samples, and
# therefore every model trained on them, are reproducible.
smote = SMOTE(random_state=42)
X_train,y_train = smote.fit_resample(X_train,y_train)

In [None]:
y_train.value_counts()

In [None]:
y_test.value_counts()

# Creating a Logistic Regression Model

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

pipe = Pipeline([("model",LogisticRegression())])
pipe.fit(X_train,y_train)

In [None]:
print(pipe)

In [None]:
pipe.get_params()

In [None]:
pipe.score(X_test,y_test)

In [None]:
pipe.score(X_train,y_train)

In [None]:
### Evaluation metrics:

from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,ConfusionMatrixDisplay,roc_auc_score,roc_curve,RocCurveDisplay
def Evaluation_matrix(y_test,y_pred,y_score=None,model_name="LogisticRegression"):
    """Print and plot evaluation metrics for a binary classifier.

    Prints accuracy, the classification report and ROC AUC, then shows the
    ROC curve and a heat map of the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted hard labels; used for accuracy, the classification report
        and the confusion matrix.
    y_score : array-like, optional
        Continuous scores for the positive class (for example the output of
        ``predict_proba(...)[:, 1]`` or ``decision_function``). When given,
        ROC AUC and the ROC curve are computed from these scores. When
        omitted, ``y_pred`` is used, which yields a single operating point.
    model_name : str, default "LogisticRegression"
        Label shown in the ROC plot legend.

    Returns
    -------
    None
    """
    ranking = y_pred if y_score is None else y_score

    # Compute each metric once and reuse it for printing and plotting.
    accuracy = accuracy_score(y_test,y_pred)
    report = classification_report(y_test,y_pred)
    score = roc_auc_score(y_test,ranking)
    cm = confusion_matrix(y_test,y_pred)

    print("AccuracyScore:",accuracy)
    print("\n classification_Report:",report)
    print("AucRocScore:",score)

    fpr,tpr,_ = roc_curve(y_test,ranking)
    curve = RocCurveDisplay(fpr=fpr,tpr=tpr,estimator_name=model_name)
    # Draw on an explicitly created axes: RocCurveDisplay.plot() without
    # ``ax`` opens its own figure, so a preceding plt.figure() stays empty.
    _, roc_ax = plt.subplots(figsize=(15,7))
    curve.plot(ax=roc_ax)
    # plt.show() takes only the keyword ``block``; it must not be given the
    # display object.
    plt.show()

    # fmt="d" prints the cell counts as integers, not in scientific notation.
    sns.heatmap(cm,annot=True,fmt="d")
    plt.title("ConfusionMatrix")
    plt.tight_layout()
    return plt.show()

In [None]:
y_pred = pipe.predict(X_test)

In [None]:
y_pred

In [None]:
Evaluation_matrix(y_test,y_pred)

In [None]:
model = LogisticRegression()
model.fit(X_train,y_train)
model.get_params()

In [None]:
from sklearn.model_selection import cross_val_score,GridSearchCV

# Hyper-parameter grid for LogisticRegression.
# The grid is a list of dicts so that only valid solver/penalty pairs are
# tried: lbfgs supports the l2 penalty only, liblinear supports l1 and l2.
# "linear" is not a solver name ("liblinear" is), and n_jobs / random_state /
# multi_class are not tuning knobs, so searching over them only multiplied the
# number of fits.
param = [
    {'solver': ['lbfgs'],
     'penalty': ['l2'],
     'C': [1.0,0.1],
     'fit_intercept': [True,False],
     'max_iter': [100,200]},
    {'solver': ['liblinear'],
     'penalty': ['l1','l2'],
     'C': [1.0,0.1],
     'fit_intercept': [True,False],
     'max_iter': [100,200],
     # liblinear shuffles the data, so pin the seed for reproducible results
     'random_state': [20]},
]

grid = GridSearchCV(model,param,cv=5,scoring="accuracy")
grid.fit(X_train,y_train)

In [None]:
grid.best_params_

In [None]:
grid.best_score_

In [None]:
y_pred_grid_lr = grid.predict(X_test)

In [None]:
Evaluation_matrix(y_test,y_pred_grid_lr)

In [None]:
# Statistical view of the same problem with statsmodels: fit a logit model on
# the FULL feature matrix X and target y (not on the train split), then refit
# using only the coefficients whose p-value is below 0.05.

# Add a constant to the independent variables
# NOTE: this rebinds the notebook-level X, so from here on X carries the extra
# constant column returned by sm.add_constant.
X = sm.add_constant(X)

# Fit logistic regression model
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# Print summary including LLR p-value
print(result.summary())

# Filter only statistically significant variables
# The constant column is part of X, so it is filtered like any other column:
# it stays in the final model only if its own p-value is below 0.05.
significant_variables = result.pvalues[result.pvalues < 0.05].index
X_significant = X[significant_variables]

# Fit the final model with significant variables
final_model = sm.Logit(y, X_significant)
final_result = final_model.fit()

# Print summary of the final model
print(final_result.summary())


# Creating a K-Nearest Neighbors Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)

In [None]:
knn.get_params()

In [None]:
knn.score(X_train,y_train)

In [None]:
knn.score(X_test,y_test)

In [None]:
y_pred = knn.predict(X_test)
Evaluation_matrix(y_test,y_pred)

In [None]:
### improve it 
# Score every k in 1..49 with 5-fold cross-validation on the TRAINING data.
# Choosing k by accuracy on X_test would leak the test set into model
# selection and make the final test score optimistic.
# NOTE(review): X_train has already been oversampled, so synthetic points can
# appear in both the fitting and validation folds; treat these scores as a
# relative ranking of k, not as an unbiased accuracy estimate.
from sklearn.model_selection import cross_val_score

knn_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=i),
        X_train,
        y_train,
        cv=5,
        scoring="accuracy",
    ).mean()
    for i in range(1,50)
]

In [None]:
plt.figure(figsize=(15,7))
plt.plot(range(1,50),knn_scores)
plt.xlim(10,20)
plt.title("Best Score Estimation")
plt.show()

In [None]:
knn = KNeighborsClassifier(n_neighbors=16)
knn.fit(X_train,y_train)

In [None]:
y_pred = knn.predict(X_test)
Evaluation_matrix(y_test,y_pred)

# Creating a Support Vector Machines Model

In [None]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train,y_train)

In [None]:
y_pred = svc.predict(X_test)

In [None]:
Evaluation_matrix(y_test,y_pred)

In [None]:
svc.get_params()

In [None]:
param = {'C': [1.0,10,100],

 'gamma': ['scale','auto'],
 'kernel': ['rbf','linear']}

grid_svc = GridSearchCV(svc,param,cv=3)
grid_svc.fit(X_train,y_train)
grid_svc.best_estimator_


In [None]:
grid_svc.best_score_

In [None]:
y_pred = grid_svc.predict(X_test)
Evaluation_matrix(y_test,y_pred)

# Creating a Decision Trees Model

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
DT = DecisionTreeClassifier()
DT.fit(X_train,y_train)
y_pred = DT.predict(X_test)
Evaluation_matrix(y_test,y_pred)

In [None]:
print(len(X_train.columns))
print(len(DT.feature_importances_))

In [None]:
plt.figure(figsize=(15,7))
plt.bar(X_train.columns,DT.feature_importances_)
plt.xlabel("Features")
plt.ylabel("Values")
plt.xticks(rotation=90)
plt.title("Feature_Importances_")
plt.show()

# Creating a Random Forests Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
Evaluation_matrix(y_test,y_pred)

In [None]:
rf.get_params()

In [None]:
rf = RandomForestClassifier(n_estimators=20)
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
Evaluation_matrix(y_test,y_pred)

In [None]:
# Hyper-parameter grid for the random forest.
# 'entropy' is the valid criterion name ('entrophy' is rejected), and the
# out-of-bag score only exists when bootstrap sampling is on, so the grid is
# split in two to avoid the invalid bootstrap=False / oob_score=True pairing.
rf_common = {'ccp_alpha': [0.001,0.002,0.003,.004,.005],
 'criterion': ['gini','entropy'],
 'n_estimators': [100,200,300,400,500]}

params = [
    {**rf_common, 'bootstrap': [True], 'oob_score': [True,False]},
    {**rf_common, 'bootstrap': [False], 'oob_score': [False]},
]

grid_rf = GridSearchCV(rf,params,cv=3)
grid_rf.fit(X_train,y_train)

In [None]:
grid_rf.best_estimator_

In [None]:
grid_rf.best_score_


In [None]:
y_pred = grid_rf.predict(X_test)
Evaluation_matrix(y_test,y_pred)

In [None]:
### Save the model
## The object written to disk is grid_svc, i.e. the whole fitted GridSearchCV
## wrapper around the SVC (including its search results), not only the best
## estimator it found.

import joblib 

# The target file name has no extension; it is created relative to the
# current working directory.
# NOTE(review): joblib files are pickle-based - only load them from trusted sources.
joblib.dump(grid_svc,"365ClassificationModel")