In [72]:
#Importing all required libraries
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

In [73]:
# Loading the data from the CSV file fakes.csv
data_path = 'fakes/data_fakes.csv'
data = pd.read_csv(data_path)

# Displaying the first few rows to see what the data looks like
data.head(), data.dtypes

(   Unnamed: 0  RefId   PurchDate  VNZIP1       VNST  MarketDate  ProductAge  \
 0           0  52641  2020-05-05   75731  Amsterdam      2017.0         5.0   
 1           1  63946  2019-19-08   62829       Rome      2015.0         7.0   
 2           2  55565  2020-18-05   10604    Hamburg      2020.0         2.0   
 3           3  64700  2020-22-12   62829       Rome      2016.0         6.0   
 4           4  33335  2019-01-02   80446     Madrid      2015.0         7.0   
 
    AveragePrice  TransactionPrice  ConfirmedPrice  \
 0         68.18             71.45           81.11   
 1         38.52             58.55           42.88   
 2         91.56             92.20           97.40   
 3        109.32             98.35          129.04   
 4         32.89             50.55           45.93   
 
                                          ProductName  \
 0  Urparcel Pet Puppy Dog Cat Sleeping Bed Cushio...   
 1  Heavy Duty Trash Compactor Bags - 60 Count - F...   
 2                   

In [78]:
#Covering multiple date formats
def parse_dates(date):
    for fmt in ('%Y-%d-%m', '%m/%d/%y'):  # adjusting formats as found the data inspection
        try:
            return pd.to_datetime(date, format=fmt)
        except ValueError:
            continue
    return pd.NaT 

data['PurchDate'] = data['PurchDate'].apply(parse_dates)



In [79]:
#Checking invalid dates 
invalid_dates = data[data['PurchDate'].isna()]
print(invalid_dates) 

Empty DataFrame
Columns: [Unnamed: 0, RefId, PurchDate, VNZIP1, VNST, MarketDate, ProductAge, AveragePrice, TransactionPrice, ConfirmedPrice, ProductName, Category, Brand, FullfillmentType, DeliveryCategory, Fake]
Index: []


In [81]:
#Converting Purchase Date from string object to date Time 
data['PurchDate'] = pd.to_datetime(data['PurchDate'], errors='coerce')
data.head()

Unnamed: 0.1,Unnamed: 0,RefId,PurchDate,VNZIP1,VNST,MarketDate,ProductAge,AveragePrice,TransactionPrice,ConfirmedPrice,ProductName,Category,Brand,FullfillmentType,DeliveryCategory,Fake
0,0,52641,2020-05-05,75731,Amsterdam,2017.0,5.0,68.18,71.45,81.11,Urparcel Pet Puppy Dog Cat Sleeping Bed Cushio...,Pet Supplies Cats,Urparcel,MARKETPLACE,DIRECT,0
1,1,63946,2019-08-19,62829,Rome,2015.0,7.0,38.52,58.55,42.88,Heavy Duty Trash Compactor Bags - 60 Count - F...,Appliances (See top 100),FKE,BUSINESS,PRIME,0
2,2,55565,2020-05-18,10604,Hamburg,2020.0,2.0,91.56,92.2,97.4,"Toppik Fiber Hold Spray, 4 oz",Beauty &amp; Personal Care,,CENTRAL,DIRECT,0
3,3,64700,2020-12-22,62829,Rome,2016.0,6.0,109.32,98.35,129.04,"Holmes &quot;C&quot; Humidifier Filter, HWF65P...",Home & Kitchen (See,Holmes,BUSINESS,PRIME,0
4,4,33335,2019-02-01,80446,Madrid,2015.0,7.0,32.89,50.55,45.93,Finest Solid Wood Cherry Elevated Dog Feeder -...,Pet Supplies Dogs,Mongoora,MARKETPLACE,DIRECT,0


In [82]:
#Dropping unnecessary columns(Unnamed and RefId)
print(data.columns)
data.drop(columns=['Unnamed: 0','RefId'], inplace=True)
data.columns

Index(['Unnamed: 0', 'RefId', 'PurchDate', 'VNZIP1', 'VNST', 'MarketDate',
       'ProductAge', 'AveragePrice', 'TransactionPrice', 'ConfirmedPrice',
       'ProductName', 'Category', 'Brand', 'FullfillmentType',
       'DeliveryCategory', 'Fake'],
      dtype='object')


Index(['PurchDate', 'VNZIP1', 'VNST', 'MarketDate', 'ProductAge',
       'AveragePrice', 'TransactionPrice', 'ConfirmedPrice', 'ProductName',
       'Category', 'Brand', 'FullfillmentType', 'DeliveryCategory', 'Fake'],
      dtype='object')

In [83]:
# Handling missing values, removing na's
data.dropna(subset=['MarketDate'], inplace=True)
data.head()

Unnamed: 0,PurchDate,VNZIP1,VNST,MarketDate,ProductAge,AveragePrice,TransactionPrice,ConfirmedPrice,ProductName,Category,Brand,FullfillmentType,DeliveryCategory,Fake
0,2020-05-05,75731,Amsterdam,2017.0,5.0,68.18,71.45,81.11,Urparcel Pet Puppy Dog Cat Sleeping Bed Cushio...,Pet Supplies Cats,Urparcel,MARKETPLACE,DIRECT,0
1,2019-08-19,62829,Rome,2015.0,7.0,38.52,58.55,42.88,Heavy Duty Trash Compactor Bags - 60 Count - F...,Appliances (See top 100),FKE,BUSINESS,PRIME,0
2,2020-05-18,10604,Hamburg,2020.0,2.0,91.56,92.2,97.4,"Toppik Fiber Hold Spray, 4 oz",Beauty &amp; Personal Care,,CENTRAL,DIRECT,0
3,2020-12-22,62829,Rome,2016.0,6.0,109.32,98.35,129.04,"Holmes &quot;C&quot; Humidifier Filter, HWF65P...",Home & Kitchen (See,Holmes,BUSINESS,PRIME,0
4,2019-02-01,80446,Madrid,2015.0,7.0,32.89,50.55,45.93,Finest Solid Wood Cherry Elevated Dog Feeder -...,Pet Supplies Dogs,Mongoora,MARKETPLACE,DIRECT,0


In [84]:
#Removing NaT dates for now
data = data.dropna(subset=['PurchDate'])
data.head()

Unnamed: 0,PurchDate,VNZIP1,VNST,MarketDate,ProductAge,AveragePrice,TransactionPrice,ConfirmedPrice,ProductName,Category,Brand,FullfillmentType,DeliveryCategory,Fake
0,2020-05-05,75731,Amsterdam,2017.0,5.0,68.18,71.45,81.11,Urparcel Pet Puppy Dog Cat Sleeping Bed Cushio...,Pet Supplies Cats,Urparcel,MARKETPLACE,DIRECT,0
1,2019-08-19,62829,Rome,2015.0,7.0,38.52,58.55,42.88,Heavy Duty Trash Compactor Bags - 60 Count - F...,Appliances (See top 100),FKE,BUSINESS,PRIME,0
2,2020-05-18,10604,Hamburg,2020.0,2.0,91.56,92.2,97.4,"Toppik Fiber Hold Spray, 4 oz",Beauty &amp; Personal Care,,CENTRAL,DIRECT,0
3,2020-12-22,62829,Rome,2016.0,6.0,109.32,98.35,129.04,"Holmes &quot;C&quot; Humidifier Filter, HWF65P...",Home & Kitchen (See,Holmes,BUSINESS,PRIME,0
4,2019-02-01,80446,Madrid,2015.0,7.0,32.89,50.55,45.93,Finest Solid Wood Cherry Elevated Dog Feeder -...,Pet Supplies Dogs,Mongoora,MARKETPLACE,DIRECT,0


In [85]:
# Creating nre feature as there is 2 types of prices available
data['PriceDiscrepancy'] = data['TransactionPrice'] - data['AveragePrice']

In [86]:
# Preparing features(X) and labels(y)
X = data.drop('Fake', axis=1)
y = data['Fake']



In [87]:
# Identifing categorical and numerical columns
categorical = X.select_dtypes(include=['object', 'category']).columns
numerical = X.select_dtypes(include=['int64', 'float64']).columns

categorical, numerical

(Index(['VNST', 'ProductName', 'Category', 'Brand', 'FullfillmentType',
        'DeliveryCategory'],
       dtype='object'),
 Index(['VNZIP1', 'MarketDate', 'ProductAge', 'AveragePrice',
        'TransactionPrice', 'ConfirmedPrice', 'PriceDiscrepancy'],
       dtype='object'))

In [88]:
# Preprocessing 
numerical_t = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_t = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Creating the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_t, numerical),
        ('cat', categorical_t, categorical)
    ])

In [89]:
#Defining a model
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [90]:
# Creating a simple modeling pippeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)])

# Spliting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Training the model
clf.fit(X_train, y_train)

# Make predictions 
y_pred = clf.predict(X_test)

y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [None]:
#Model Evaluation
print(classification_report(y_test, y_pred))

# Perform cross-validation
scores = cross_val_score(clf, X, y, cv=5)
print("Average cross-validation score: {:.2f}".format(np.mean(scores)))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94     15354
           1       0.78      0.22      0.35      2162

    accuracy                           0.90     17516
   macro avg       0.84      0.61      0.65     17516
weighted avg       0.89      0.90      0.87     17516

