In [7]:
#pip install imblearn
print("Step 1: Required librarie imported successfully")


import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from imblearn.over_sampling import SMOTE

from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB



####################
# To ignore warning#
####################

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)


################################################
# Loading online_shoppers_intention.csv dataset#
################################################

print("Step 2: Created DataFrame successfully")

df = pd.read_csv("A:/Csv_files/online_shoppers_intention/online_shoppers_intention.csv")



######################
# Feature Engineering#
######################

print("Step 3: Feature Engineering Done successfully on Weekend, Revenue")

df['Weekend'] = df['Weekend'].replace((True, False), (1, 0))
df['Revenue'] = df['Revenue'].replace((True, False), (1, 0))

condition = df['VisitorType']=='Returning_Visitor'


#################################
# Added Returning_Visitor column#
#################################

print("Step 4: Added Returning_Visitor column successfully")

df['Returning_Visitor'] = np.where(condition, 1, 0)

df = df.drop(columns=['VisitorType'])


############################################
# Applying One Hot Encoding on Month column#
############################################

print("Step 5: Applied one hot encoding successfully on Month column")

ordinal_encoder = OrdinalEncoder()
df['Month'] = ordinal_encoder.fit_transform(df[['Month']])


#########################################
# Checking correlation on Revenue column#
#########################################

print("Step 6: Checking correlation done successfully")

result = df[df.columns[1:]].corr()['Revenue']						
result1 = result.sort_values(ascending=False)


###########################################
# Prepairing Features as X and target as y#
###########################################

print("Step 7: Prepairing features as X and target as y done successfully")

X = df.drop(['Revenue'], axis=1)
y = df['Revenue']



####################################
# Prepairing Train and Test Dataset#
####################################

print("Step 8: Splitting data X_train, X_test, y_train & y_test done successfully")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 0)


#################
# Model Pipeline#
#################

print("Step 9: model_pipeline fcuntion created done successfully")

def model_pipeline(X, model):  
    n_c = X.select_dtypes(exclude=['object']).columns.values.tolist()
    c_c = X.select_dtypes(include=['object']).columns.values.tolist()

    numeric_columns = n_c
    categorical_columns = c_c

    numeric_pipeline = SimpleImputer(strategy = 'constant')

    categorical_pipeline = OneHotEncoder(handle_unknown = 'ignore')

    a = ('numeric', numeric_pipeline, numeric_columns)
    b = ('categorical', categorical_pipeline, categorical_columns)

    preprocessor = ColumnTransformer(

    transformers = [a, b], 
    remainder = 'passthrough'

    )

    c = ('preprocessor', preprocessor)
    d = ('smote', SMOTE(random_state = 1))
    e = ('scaler', MinMaxScaler())
    f = ('feature_selection', SelectKBest(score_func = chi2, k = 6))
    g = ('model', model)

    bundled_pipeline = imbpipeline(steps = [c, d, e, f, g])

    return bundled_pipeline

##################
# Model Selection#
##################


print("Step 10: select_model fcuntion created done successfully")


def select_model(X, y, pipeline=None):

    classifiers = {}
    

    c_d4 = {"RandomForestClassifier": RandomForestClassifier()}
    classifiers.update(c_d4)

    c_d5 = {"DecisionTreeClassifier": DecisionTreeClassifier()}
    classifiers.update(c_d5)

    c_d9 = {"KNeighborsClassifier": KNeighborsClassifier()}
    classifiers.update(c_d9)

    c_d10 = {"RidgeClassifier": RidgeClassifier()}
    classifiers.update(c_d10)

    c_d13 = {"BernoulliNB": BernoulliNB()}
    classifiers.update(c_d13)

    c_d14 = {"SVC": SVC()}
    classifiers.update(c_d14)
   
    cols = ['model', 'run_time', 'roc_auc']
    df_models = pd.DataFrame(columns = cols)

    for key in classifiers:
        
        start_time = time.time()
        
        print()
        print("Step 12: model_pipeline run successfully on", key)

        pipeline = model_pipeline(X_train, classifiers[key])
        
        cv = cross_val_score(pipeline, X, y, cv=10, scoring='roc_auc')

        row = {'model': key,
               'run_time': format(round((time.time() - start_time)/60,2)),
               'roc_auc': cv.mean(),
        }

        df_models = pd.concat([df_models, pd.DataFrame([row])], ignore_index=True)
        
    df_models = df_models.sort_values(by='roc_auc', ascending=False)
	
    return df_models
    

#####################################
# Access Model select_model function#
#####################################

print("Step 11: Accessing select_model function done successfully")


models = select_model(X_train, y_train)


###################################
# Lets see total model with score #
###################################

print("Step 13: Accessing select_model function done successfully")

print(models)

#####################################
# Accessing best model and training #
#####################################

print("Step 14: Accessing select_model function done successfully")

selected_model = SVC()
bundled_pipeline = model_pipeline(X_train, selected_model)
bundled_pipeline.fit(X_train, y_train)

#####################################
# Accessing best model and training #
#####################################

print("Step 15: Results predicted successfully")
y_pred = bundled_pipeline.predict(X_test)
print(y_pred)

#####################
# ROC and AOC score #
#####################

print("Step 16: ROC and AOC scores")

roc_auc = roc_auc_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
f1_score = f1_score(y_test, y_pred)


p;;rint('ROC/AUC:', roc_auc)
print('Accuracy:', accuracy)
print('F1 score:', f1_score)



#########################
# Classification report #
#########################

print("Step 17: classification report generated successfully")

classif_report = classification_report(y_test, y_pred)

print(classif_report)

########################################
# BOSS its a right time to celebrate :)#
########################################



Step 1: Required librarie imported successfully
Step 2: Created DataFrame successfully
Step 3: Feature Engineering Done successfully on Weekend, Revenue
Step 4: Added Returning_Visitor column successfully
Step 5: Applied one hot encoding successfully on Month column
Step 6: Checking correlation done successfully
Step 7: Prepairing features as X and target as y done successfully
Step 8: Splitting data X_train, X_test, y_train & y_test done successfully
Step 9: model_pipeline fcuntion created done successfully
Step 10: select_model fcuntion created done successfully
Step 11: Accessing select_model function done successfully

Step 12: model_pipeline run successfully on RandomForestClassifier


  df_models = pd.concat([df_models, pd.DataFrame([row])], ignore_index=True)



Step 12: model_pipeline run successfully on DecisionTreeClassifier

Step 12: model_pipeline run successfully on KNeighborsClassifier

Step 12: model_pipeline run successfully on RidgeClassifier

Step 12: model_pipeline run successfully on BernoulliNB

Step 12: model_pipeline run successfully on SVC
Step 13: Accessing select_model function done successfully
                    model run_time   roc_auc
0  RandomForestClassifier     0.62  0.886387
5                     SVC     0.58  0.885963
4             BernoulliNB     0.01  0.857851
3         RidgeClassifier     0.01  0.855441
2    KNeighborsClassifier     0.01  0.840505
1  DecisionTreeClassifier     0.02  0.733311
Step 14: Accessing select_model function done successfully
Step 15: Results predicted successfully
[0 0 0 ... 0 0 0]
Step 16: ROC and AOC scores
ROC/AUC: 0.7772768502330849
Accuracy: 0.8780751554474182
F1 score: 0.6330349877949553
Step 17: classification report generated successfully
              precision    recall  f1-sc

In [8]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Assuming 'X' is your feature matrix and 'y' is your target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a logistic regression model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8669910786699108


In [4]:
print("Step 1: Required librarie imported successfully")


import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from imblearn.over_sampling import SMOTE

from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB



####################
# To ignore warning#
####################

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

# Loading online_shoppers_intention.csv dataset#
################################################

print("Step 2: Created DataFrame successfully")

df = pd.read_csv("A:/Csv_files/online_shoppers_intention/online_shoppers_intention.csv")

print(df.head())
#print(df.info())
#print(df.describe())


######################
# Feature Engineering#
######################

print("Step 3: Feature Engineering Done successfully on Weekend, Revenue")

df['Weekend'] = df['Weekend'].replace((True, False), (1, 0))
df['Revenue'] = df['Revenue'].replace((True, False), (1, 0))

condition = df['VisitorType']=='Returning_Visitor'


#################################
# Added Returning_Visitor column#
#################################

print("Step 4: Added Returning_Visitor column successfully")

df['Returning_Visitor'] = np.where(condition, 1, 0)

df = df.drop(columns=['VisitorType'])



Step 1: Required librarie imported successfully
Step 2: Created DataFrame successfully
   Administrative  Administrative_Duration  Informational  \
0               0                      0.0              0   
1               0                      0.0              0   
2               0                      0.0              0   
3               0                      0.0              0   
4               0                      0.0              0   

   Informational_Duration  ProductRelated  ProductRelated_Duration  \
0                     0.0               1                 0.000000   
1                     0.0               2                64.000000   
2                     0.0               1                 0.000000   
3                     0.0               2                 2.666667   
4                     0.0              10               627.500000   

   BounceRates  ExitRates  PageValues  SpecialDay Month  OperatingSystems  \
0         0.20       0.20         0.0         0.

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
df=pd.read_csv('A:/Csv_files/Pakistan_dataset/Pakistan Largest Ecommerce Dataset.csv')
#print(patal_lok)
#df.head()
#df[df['Customer ID']==33.0]
#df.info()
#df.shape
df.isna().sum().sort_values(ascending=False)

  df=pd.read_csv('A:/Csv_files/Pakistan_dataset/Pakistan Largest Ecommerce Dataset.csv')


Unnamed: 25              1048575
Unnamed: 24              1048575
Unnamed: 23              1048575
Unnamed: 22              1048575
Unnamed: 21              1048575
sales_commission_code     601229
category_name_1           464215
sku                       464071
status                    464066
Customer ID               464062
Customer Since            464062
Year                      464051
FY                        464051
M-Y                       464051
Month                     464051
item_id                   464051
 MV                       464051
Working Date              464051
payment_method            464051
discount_amount           464051
increment_id              464051
grand_total               464051
qty_ordered               464051
price                     464051
created_at                464051
BI Status                 464051
dtype: int64

In [13]:
(df.isna().mean()*100).sort_values(ascending=False)

Unnamed: 25              100.000000
Unnamed: 24              100.000000
Unnamed: 23              100.000000
Unnamed: 22              100.000000
Unnamed: 21              100.000000
sales_commission_code     57.337720
category_name_1           44.271034
sku                       44.257302
status                    44.256825
Customer ID               44.256443
Customer Since            44.256443
Year                      44.255394
FY                        44.255394
M-Y                       44.255394
Month                     44.255394
item_id                   44.255394
 MV                       44.255394
Working Date              44.255394
payment_method            44.255394
discount_amount           44.255394
increment_id              44.255394
grand_total               44.255394
qty_ordered               44.255394
price                     44.255394
created_at                44.255394
BI Status                 44.255394
dtype: float64

In [14]:
pd.crosstab(index=df[df['sales_commission_code'].isna()]['status'],columns=df[df['sales_commission_code'].isna()]['BI Status'],margins=True)

BI Status,Gross,Net,Valid,All
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
canceled,59806,0,0,59806
closed,0,83,0,83
cod,0,0,1128,1128
complete,0,28307,0,28307
holded,25,0,0,25
order_refunded,0,0,11347,11347
paid,0,0,502,502
payment_review,31,0,0,31
pending,39,0,0,39
processing,31,0,0,31


In [19]:
df['category_name_1'].fillna(df['category_name_1'].mode()[0],inplace=True)
#print(df)

In [20]:
df['sku'].fillna("Missing",inplace=True)


In [2]:
#Ecommerce purchase project
1.Display Top 10 Rows of the Dataset
data.head(10)

2.check last 10 rows of the dataset
data.tail(10)

3.check datatype of each column
data.dtypes

4.check Null values in dataset
data.isnull().sum()

5.How many Rows and Columns are there in our dataset
len(data.columns)
len(data)
data.info()

6.Highest and lowest purchase prices
data.columns
data['Purchase Price'].max()
data['Purchase Price'].min()

7.Average Purchase Price
data['Purchase Price'].mean()

8.How many people Have French "fr" as their language?
data[data['language']=="fr"]
len(data[data['language']=="fr"])
data[data['language']=="fr"].count()

9.job Title Contains Engineer
len(data[data['job'].str.contains('engineer',case=False)])

10.Find Email of the persion with the following IP address:132.207.160.22
data[data['IP Address']=="132.207.160.22"]['Email']

len(data[(data['CC Provider']=="Mastercard") & data ['Purchase Price']>50)])  00r0r0r
data[(data['CC Provider']=="Mastercard") & (data ['Purchase Price']>50)].count()

12.Find Email of the persion with the following credit card number: 4664825258997302
data.columns
data[data['credit card']=="4664825258997302"]['Email']

13.How many people purchase during the AM and How many people purchase during pm
data.columns
data['AM or PM'].value_counts()

14.how many people have credit card thet expire in 2020
data.columns
len(data[data['CC Exp Date'].apply(lamda x:x[3:]=='20')])
or
def fun():
    count=0
    for date in data['CC Exp Date']:
        if date.split('/')[1]=='20':
            count=count+1
    
    print(count)    
fun()        

15. Top 5 Most Popular Email providers(eg.gmail.com,yahoo.com,etc)
list1=[]
for email in data['Email']:
    list1.append(email.split('@')[1])
data['temp']=list1
data.head(1)
data['temp'].value_counts().head()
#orrrrr
data['Email'].apply(lambda X:X.split('@')[1]).value_counts().head()

SyntaxError: invalid decimal literal (2205153000.py, line 2)

In [3]:
def fun():
    count=0
    for date in data['CC Exp Date']:
        if date.split('/')[1]=='20':
            count=count+1
    
    print(count)    
fun()        

NameError: name 'data' is not defined

In [None]:
import pandas as pd
data=pd.read_csv('salaries.csv')