### Data Analysis and Visualization

##### Importing necessary libraries

In [None]:
# Numerical / tabular libraries
import numpy as np
import pandas as pd
# Plotting libraries; the "fivethirtyeight" style is applied to every figure below
import matplotlib.pyplot as plt
plt.style.use("fivethirtyeight")
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the rest of the notebook, including
# deprecation and convergence warnings raised by the libraries used further down.
warnings.filterwarnings("ignore")

##### Reading Dataset

In [None]:
# Load the dataset; the path is relative, so the CSV must sit in the working directory.
data = pd.read_csv("online_shoppers_intention.csv")

In [None]:
# Preview the first rows to check column names and value formats.
data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# Per-column dtype and non-null count summary.
data.info()

##### Univariate Analysis

In [None]:
# Two side-by-side pie charts: the share of each Revenue value (left)
# and the share of each VisitorType value (right).
fig, (ax_revenue, ax_visitor) = plt.subplots(1, 2, figsize=(10, 10))

revenue_counts = data["Revenue"].value_counts()
revenue_counts.plot(kind="pie", autopct="%0.1f%%", ax=ax_revenue)

visitor_counts = data["VisitorType"].value_counts()
visitor_counts.plot(kind="pie", autopct="%0.1f%%", ax=ax_visitor)

plt.show()

Inference : About 84.5% of the sessions did not end in a purchase, and about 85.6% of the sessions come from returning visitors.

In [None]:
# Count of rows per value of the Month column.
sns.histplot(data=data, x="Month")
plt.show()

Inference : The number of visitors is highest in May and November.

##### Bivariate analysis

In [None]:
# One scatter panel per page category: page count (x) against the matching duration
# column (y). Each entry is (x column, y column, x label, y label, extra scatter kwargs);
# the first panel passes no colour and therefore uses the style's default.
panels = [
    ("Administrative", "Administrative_Duration",
     "Administrative pages", "Administrative Duration", {}),
    ("Informational", "Informational_Duration",
     "Informational pages", "Informational Duration", {"color": "m"}),
    ("ProductRelated", "ProductRelated_Duration",
     "Product-Related pages", "Product-Related Duration", {"color": "g"}),
]

plt.figure(figsize=(15, 5))

for position, (pages_col, duration_col, x_label, y_label, scatter_kwargs) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.scatter(data[pages_col], data[duration_col], **scatter_kwargs)

plt.show()

Inference : 1 - The administrative duration mostly lies between 0 and 1000, and the number of administrative pages visited is mostly between 0 and 10.

2 - For informational pages, the duration is likewise concentrated between 0 and 1000, and the number of pages visited is between 0 and 10.

3 - For product-related pages, the duration is mostly between 0 and 10000, and the number of pages visited is mostly between 0 and 400.

In [None]:
# Row counts for every combination of SpecialDay value (rows) and Revenue value (columns).
pd.crosstab(data["SpecialDay"],data["Revenue"])

Inference : When `SpecialDay` is below 0.6, less revenue is generated than when it is above 0.6. Overall, however, more revenue is generated on normal days than on special days.

In [None]:
# Bar height is the mean of Revenue within each Month.
# `errorbar=None` switches the error bars off. It replaces the `ci` argument, which
# seaborn deprecated in v0.12 (and `ci=False` was never a documented value).
sns.barplot(x="Month", y="Revenue", data=data, errorbar=None)
plt.show()

Inference : The share of sessions that generate revenue is higher in November, October, September and August than in the other months.

### Multivariate Analysis

In [None]:
# Row counts per (Month, VisitorType) pair (row index) split by Revenue value (columns).
pd.crosstab([data["Month"],data["VisitorType"]],data["Revenue"])

In [None]:
# Pairwise correlations of the numeric columns, drawn as an annotated heatmap.
corr_matrix = data.corr(numeric_only=True)

plt.figure(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True)

##### Descriptive Analysis

In [None]:
# Summary statistics of the numeric columns.
data.describe()

### Data preprocessing

#### Handling Null values if any

In [None]:
# Per column: True if the column contains at least one missing value.
data.isnull().any()

In [None]:
# Per column: number of missing values.
data.isnull().sum()

#### Handling categorical values -- Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Single encoder instance, reused (refit) for every categorical column below.
le = LabelEncoder()

In [None]:
# Replace each non-numeric column with integer codes, overwriting the column in `data`.
# The same `le` instance is refit on every column, so once this cell has run it only
# remembers the classes of the last column processed ("Revenue").
for column in ("Month", "VisitorType", "Weekend", "Revenue"):
    data[column] = le.fit_transform(data[column])

#### Dropping unwanted features -- KMeans is unsupervised, so the target feature must be removed

In [None]:
# Feature matrix for clustering: every column except the target.
data_k = data.drop(columns=["Revenue"])

#### Feature Scaling/Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scaler created with its default settings.
sc = MinMaxScaler()

In [None]:
# Fit the scaler on the clustering features and rescale them in one step.
# The result is re-wrapped in a DataFrame in the next cell.
data_k1 = sc.fit_transform(data_k)

In [None]:
# Put the scaled values back into a DataFrame with the original column names.
# Note that this rebinds `data_k`: from here on it holds the scaled features.
data_k = pd.DataFrame(data_k1,columns=data_k.columns)
data_k.head()

## Model Building

### Unsupervised ML - KMeans

#### Elbow method

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Elbow method: fit KMeans for k = 1..10 and record each model's inertia
# (within-cluster sum of squares) in `wcss`, in order of k.
wcss = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, init="k-means++", random_state=0).fit(data_k)
    wcss.append(kmeans.inertia_)

In [None]:
# Inspect the recorded inertia values (one per k).
wcss

In [None]:
# Elbow plot: inertia (WCSS) against the number of clusters.
# The x values are derived from len(wcss) rather than a second hard-coded range, so the
# plot cannot drift out of sync with the list it draws (wcss[0] belongs to k = 1).
k = list(range(1, len(wcss) + 1))
plt.plot(k, wcss, "m-o")
plt.xlabel("Number of clusters (k)")
plt.ylabel("WCSS (inertia)")
plt.title("k vs wcss")

#### Initializing the model

In [None]:
# Final clustering model with a fixed seed for reproducibility.
# NOTE(review): n_clusters=4 is presumably read off the elbow plot above — confirm.
km = KMeans(n_clusters=4,init="k-means++",random_state=0)

In [None]:
# Fit the model on the scaled features and get the cluster index assigned to each row.
y_pred_km = km.fit_predict(data_k)

#### Dimensionality Reduction using Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Project the scaled features onto their first two principal components
# so the clusters can be drawn in 2-D. The last line displays the projected array.
pca = PCA(n_components=2)
dfPCA = pca.fit_transform(data_k)
dfPCA

In [None]:
# Wrap the two principal components in a DataFrame with readable column names.
# Note that this rebinds `dfPCA` from the projected array to a DataFrame.
dfPCA = pd.DataFrame(dfPCA,columns=["PCA 1","PCA 2"])
dfPCA.head()

In [None]:
# Attach the KMeans cluster index of each row to its 2-D projection.
dfPCA["Cluster"] = y_pred_km

In [None]:
#Visualizing data points using seaborn

plt.figure(figsize=(10,10))
sns.scatterplot(x="PCA 1",y="PCA 2",hue = dfPCA["Cluster"],palette=["red","yellow","green","blue"],data=dfPCA)

# KMeans was fitted on the full scaled feature space, so columns 0 and 1 of
# km.cluster_centers_ are the first two *features*, not PCA coordinates. Project the
# centroids with the same fitted PCA before drawing them on the PCA 1 / PCA 2 axes.
# Wrapping them in a DataFrame keeps the feature names the PCA was fitted with.
centroids_2d = pca.transform(pd.DataFrame(km.cluster_centers_,columns=data_k.columns))
plt.scatter(centroids_2d[:,0],centroids_2d[:,1],color="black",s=300,marker="*",label="centroid")
plt.legend()

### Supervised ML

In [None]:
# Splitting dataset: 70 % train / 30 % test, with a fixed seed so the split is repeatable
from sklearn.model_selection import train_test_split

y = data["Revenue"]                   # target
x = data.drop(columns=["Revenue"])    # every other column is a feature

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=10
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
def logisticReg(x_train,x_test,y_train,y_test):
    """Fit a logistic-regression classifier and print its test-set evaluation.

    The features are standardised inside a pipeline before the regression is fitted.
    The raw columns are on very different scales, which makes the default solver hit
    its iteration cap without converging; that warning is invisible in this notebook
    because all warnings are filtered out. The scaler is fitted on the training data
    only, so nothing leaks from the test set.

    Parameters
    ----------
    x_train, x_test : feature tables for training and evaluation.
    y_train, y_test : matching target labels.

    Returns
    -------
    The fitted scaler + LogisticRegression pipeline; it exposes ``predict`` like a
    plain estimator. Returned so this function matches ``randomForest``.
    """
    # Function-scope imports keep this cell runnable without editing the import cell.
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    lr = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
    lr.fit(x_train,y_train)
    yPred = lr.predict(x_test)
    print("Logistic Regression : ")
    print("Confusion matrix")
    print(confusion_matrix(y_test,yPred))
    print("Classification Report : ")
    print(classification_report(y_test,yPred))
    return lr
    

In [None]:
# Train and evaluate the logistic-regression baseline; the trailing semicolon keeps the
# cell output limited to what the function prints.
logisticReg(x_train, x_test, y_train, y_test);

In [None]:
def randomForest(x_train,x_test,y_train,y_test,random_state=0):
    """Fit a random-forest classifier, print its test-set evaluation and return it.

    Parameters
    ----------
    x_train, x_test : feature tables for training and evaluation.
    y_train, y_test : matching target labels.
    random_state : seed passed to the forest. Without a seed every run trains a
        different forest, so the printed metrics and the model saved afterwards
        could not be reproduced. Pass ``None`` to get the unseeded behaviour back.

    Returns
    -------
    The fitted RandomForestClassifier.
    """
    rf = RandomForestClassifier(random_state=random_state)
    rf.fit(x_train,y_train)
    yPred = rf.predict(x_test)
    print("RandomForestClassifier : ")
    print("Confusion matrix")
    print(confusion_matrix(y_test,yPred))
    print("Classification report")
    print(classification_report(y_test,yPred))
    return rf

In [None]:
# Train and evaluate the random forest; keep the fitted model so it can be saved below.
rf=randomForest(x_train,x_test,y_train,y_test)

### Saving the model

In [None]:
import pickle

# Serialise the fitted random forest to model.pkl in the working directory.
# The context manager closes the file handle (and flushes the data) even if dumping
# fails; a bare open() inside the call left the handle open.
with open("model.pkl","wb") as model_file:
    pickle.dump(rf,model_file)