In [1]:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

Matplotlib is building the font cache; this may take a moment.


In [2]:
# Load preprod_dataset.csv (path is relative to the notebook's working directory)
# into `df`, then preview the first rows to see the available columns.
df = pd.read_csv("preprod_dataset.csv")
df.head()

Unnamed: 0,Customer_id,Brand_name,Screen_size,Battery_capacity,Ram_size,Storage_capacity,Purchase_date,Price,Payment_type,Units_sold
0,4382JV,Apple,7.5,11250,12,512,2023/09/23,791,paypal,3532
1,tCTYRh,Xiaomi,7.7,11550,12,512,2023/06/13,792,credit card,3400
2,Mx77Cv,Xiaomi,6.0,9000,9,32,2024/04/25,66,cash,1638
3,2J38jY,Samsung,5.5,8250,9,250,2023/03/08,392,paypal,1176
4,r2Hh39,Oppo,4.9,7350,8,64,2023/03/31,111,credit card,2802


In [4]:
# Check how pandas typed each column; in the recorded output Purchase_date is
# still `object` (text) at this point and is converted in a later cell.
df.dtypes

Customer_id          object
Brand_name           object
Screen_size         float64
Battery_capacity      int64
Ram_size              int64
Storage_capacity      int64
Purchase_date        object
Price                 int64
Payment_type         object
Units_sold            int64
dtype: object

In [5]:
# Count missing values per column; the recorded output is 0 for every column,
# so no imputation is needed for this file.
df.isnull().sum()

Customer_id         0
Brand_name          0
Screen_size         0
Battery_capacity    0
Ram_size            0
Storage_capacity    0
Purchase_date       0
Price               0
Payment_type        0
Units_sold          0
dtype: int64

In [6]:
# Count the distinct purchase dates to gauge the column's cardinality.
df["Purchase_date"].nunique()

445

In [7]:
# Parse Purchase_date from text into datetime64 so the .dt accessors can be used
# in the next cell. The previewed rows are written as YYYY/MM/DD (e.g. 2023/09/23);
# passing the format explicitly avoids relying on pandas' format inference and
# fails fast on any value that does not match.
# NOTE(review): only the first five rows are visible in the preview — confirm
# that the whole column uses this layout.
df["Purchase_date"] = pd.to_datetime(df["Purchase_date"], format="%Y/%m/%d")
df.dtypes

Customer_id                 object
Brand_name                  object
Screen_size                float64
Battery_capacity             int64
Ram_size                     int64
Storage_capacity             int64
Purchase_date       datetime64[ns]
Price                        int64
Payment_type                object
Units_sold                   int64
dtype: object

In [8]:
# Split the purchase date into numeric year / month / day features and then
# remove the original datetime column, which the model cannot consume directly.
# The guard makes the cell safe to re-run: once Purchase_date has been dropped,
# the derived columns already exist and there is nothing left to do.
if "Purchase_date" in df.columns:
    purchase_dt = df["Purchase_date"]
    df["Purchase_date_year"] = purchase_dt.dt.year
    df["Purchase_date_month"] = purchase_dt.dt.month
    df["Purchase_date_day"] = purchase_dt.dt.day
    # Rebind instead of inplace=True so the drop is an explicit, visible step.
    df = df.drop(columns="Purchase_date")
df.head()


Unnamed: 0,Customer_id,Brand_name,Screen_size,Battery_capacity,Ram_size,Storage_capacity,Price,Payment_type,Units_sold,Purchase_date_year,Purchase_date_month,Purchase_date_day
0,4382JV,Apple,7.5,11250,12,512,791,paypal,3532,2023,9,23
1,tCTYRh,Xiaomi,7.7,11550,12,512,792,credit card,3400,2023,6,13
2,Mx77Cv,Xiaomi,6.0,9000,9,32,66,cash,1638,2024,4,25
3,2J38jY,Samsung,5.5,8250,9,250,392,paypal,1176,2023,3,8
4,r2Hh39,Oppo,4.9,7350,8,64,111,credit card,2802,2023,3,31


In [9]:
# Count the distinct payment methods; a small number here (4 in the recorded
# output) keeps the later one-hot encoding narrow.
df["Payment_type"].nunique()

4

In [10]:
# Count the distinct customer ids. A very high count (1000 in the recorded
# output) suggests the column is a row identifier rather than a feature —
# presumably one id per row; verify against len(df).
df["Customer_id"].nunique()

1000

In [11]:
# Customer_id is an identifier rather than a feature, so remove it before modelling.
# errors="ignore" keeps the cell re-runnable once the column is already gone
# (the original in-place drop raised KeyError on a second run).
df = df.drop(columns="Customer_id", errors="ignore")
df.head()

Unnamed: 0,Brand_name,Screen_size,Battery_capacity,Ram_size,Storage_capacity,Price,Payment_type,Units_sold,Purchase_date_year,Purchase_date_month,Purchase_date_day
0,Apple,7.5,11250,12,512,791,paypal,3532,2023,9,23
1,Xiaomi,7.7,11550,12,512,792,credit card,3400,2023,6,13
2,Xiaomi,6.0,9000,9,32,66,cash,1638,2024,4,25
3,Samsung,5.5,8250,9,250,392,paypal,1176,2023,3,8
4,Oppo,4.9,7350,8,64,111,credit card,2802,2023,3,31


In [12]:
# Brand_name is the target variable, so the feature matrix X is made of
# every other column of df.
X = df.drop(columns="Brand_name")
X.head()

Unnamed: 0,Screen_size,Battery_capacity,Ram_size,Storage_capacity,Price,Payment_type,Units_sold,Purchase_date_year,Purchase_date_month,Purchase_date_day
0,7.5,11250,12,512,791,paypal,3532,2023,9,23
1,7.7,11550,12,512,792,credit card,3400,2023,6,13
2,6.0,9000,9,32,66,cash,1638,2024,4,25
3,5.5,8250,9,250,392,paypal,1176,2023,3,8
4,4.9,7350,8,64,111,credit card,2802,2023,3,31


In [13]:
# Find the feature columns that are still non-numeric; in the recorded output
# only Payment_type is `object`, so it is the one column that needs encoding.
X.dtypes

Screen_size            float64
Battery_capacity         int64
Ram_size                 int64
Storage_capacity         int64
Price                    int64
Payment_type            object
Units_sold               int64
Purchase_date_year       int32
Purchase_date_month      int32
Purchase_date_day        int32
dtype: object

In [14]:
from sklearn.preprocessing import OneHotEncoder

# Feature columns still stored as text; these are the ones to one-hot encode.
categorical_columns = X.select_dtypes(include=["object"]).columns.tolist()

# The guard keeps the cell re-runnable: after a first run X has no object
# columns left, and fitting the encoder on an empty selection would fail.
if categorical_columns:
    # sparse_output=False returns a dense array that can be wrapped in a DataFrame.
    encoder = OneHotEncoder(sparse_output=False)

    # Fit on X (the frame the columns were selected from) rather than on df,
    # so the encoder always sees exactly the feature matrix being transformed.
    one_hot_encoded = encoder.fit_transform(X[categorical_columns])

    # get_feature_names_out() supplies the "<column>_<category>" column names.
    # Reusing X's index guarantees that the concat below aligns row-for-row
    # even if X does not carry a default 0..n-1 index.
    one_hot_df = pd.DataFrame(
        one_hot_encoded,
        columns=encoder.get_feature_names_out(categorical_columns),
        index=X.index,
    )

    # Append the indicator columns, then remove the original text columns.
    df_encoded = pd.concat([X, one_hot_df], axis=1)
    X = df_encoded.drop(columns=categorical_columns)

X.head()


Unnamed: 0,Screen_size,Battery_capacity,Ram_size,Storage_capacity,Price,Units_sold,Purchase_date_year,Purchase_date_month,Purchase_date_day,Payment_type_cash,Payment_type_credit card,Payment_type_debit card,Payment_type_paypal
0,7.5,11250,12,512,791,3532,2023,9,23,0.0,0.0,0.0,1.0
1,7.7,11550,12,512,792,3400,2023,6,13,0.0,1.0,0.0,0.0
2,6.0,9000,9,32,66,1638,2024,4,25,1.0,0.0,0.0,0.0
3,5.5,8250,9,250,392,1176,2023,3,8,0.0,0.0,0.0,1.0
4,4.9,7350,8,64,111,2802,2023,3,31,0.0,1.0,0.0,0.0


In [15]:
# Verify the result of the encoding step: every feature column should now be
# numeric, with the Payment_type_* indicator columns stored as float64.
X.dtypes

Screen_size                 float64
Battery_capacity              int64
Ram_size                      int64
Storage_capacity              int64
Price                         int64
Units_sold                    int64
Purchase_date_year            int32
Purchase_date_month           int32
Purchase_date_day             int32
Payment_type_cash           float64
Payment_type_credit card    float64
Payment_type_debit card     float64
Payment_type_paypal         float64
dtype: object

In [16]:
# Target vector: the brand label of each row, taken from df because X no
# longer contains the Brand_name column.
y=df["Brand_name"]
y.head()

0      Apple
1     Xiaomi
2     Xiaomi
3    Samsung
4       Oppo
Name: Brand_name, dtype: object

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=20,
)

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier

In [20]:
# Re-display the first target labels as a sanity check before fitting
# (same preview as the earlier y.head() cell).
y.head()

0      Apple
1     Xiaomi
2     Xiaomi
3    Samsung
4       Oppo
Name: Brand_name, dtype: object

In [21]:
# Fit an AdaBoost classifier with scikit-learn's default settings.
# random_state is pinned (same seed as the train/test split) so that
# Restart & Run All reproduces the same fitted model and score.
adb = AdaBoostClassifier(random_state=20)
# fit() returns the estimator itself, so adb_model refers to the same object as adb.
adb_model = adb.fit(X_train, y_train)



In [23]:
# Score the fitted model on the held-out split (X_test / y_test); score()
# reports mean accuracy for classifiers.
# NOTE(review): the recorded run scored 0.172 — compare against a
# majority-class baseline before reading anything into this number.
accuracy = adb_model.score(X=X_test, y=y_test)
print("The accuracy of the model on validation set is", accuracy)

The accuracy of the model on validation set is 0.172
