In [1]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('flipkart_com-ecommerce_sample_small.csv')

# Drop rows with missing values
df.dropna(inplace=True)

# Fill missing values with the mean for numeric columns only
numeric_columns = df.select_dtypes(include=['number']).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# Remove duplicate entries
df.drop_duplicates(inplace=True)

print("Data cleaned successfully.")


Data cleaned successfully.


In [3]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
df = pd.read_csv('flipkart_com-ecommerce_sample.csv')
# Normalize the 'retail_price' and 'discounted_price' columns to a 0-1 range
scaler = MinMaxScaler()
df[['retail_price', 'discounted_price']] = scaler.fit_transform(df[['retail_price', 'discounted_price']])

print("Data normalized.")

Data normalized.


In [5]:
# Apply One-Hot Encoding to the 'Category' column
df = pd.get_dummies(df, columns=['product_category_tree'], drop_first=True)

print("Categorical variables encoded.")

Categorical variables encoded.


In [13]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Corrected Example DataFrame with equal-length lists
data = {
    'Product': ['Laptop', 'Shirt', 'Mobile', 'Shoes', 'Tablet', 'Women Clothes', 'Man Clothes'],
    'Category': ['Electronics', 'Fashion', 'Electronics', 'Fashion', 'Electronics', 'Fashion', 'Fashion'],
    'Rating': ['High', 'Medium', 'Low', 'High', 'Medium', 'Low', 'High']
}

df = pd.DataFrame(data)

# One-Hot Encoding for 'Category' column
df = pd.get_dummies(df, columns=['Category'], drop_first=True)
print("One-Hot Encoding applied to 'Category' column:\n", df)

# Label Encoding for 'Rating' column
label_encoder = LabelEncoder()
df['Rating'] = label_encoder.fit_transform(df['Rating'])
print("\nLabel Encoding applied to 'Rating' column:\n", df)



One-Hot Encoding applied to 'Category' column:
          Product  Rating  Category_Fashion
0         Laptop    High             False
1          Shirt  Medium              True
2         Mobile     Low             False
3          Shoes    High              True
4         Tablet  Medium             False
5  Women Clothes     Low              True
6    Man Clothes    High              True

Label Encoding applied to 'Rating' column:
          Product  Rating  Category_Fashion
0         Laptop       0             False
1          Shirt       2              True
2         Mobile       1             False
3          Shoes       0              True
4         Tablet       2             False
5  Women Clothes       1              True
6    Man Clothes       0              True


In [15]:
import pandas as pd

df = pd.read_csv('flipkart_com-ecommerce_sample.csv')
print(df.columns)


Index(['uniq_id', 'crawl_timestamp', 'product_url', 'product_name',
       'product_category_tree', 'pid', 'retail_price', 'discounted_price',
       'image', 'is_FK_Advantage_product', 'description', 'product_rating',
       'overall_rating', 'brand', 'product_specifications'],
      dtype='object')


In [3]:
import pandas as pd

df = pd.read_csv('flipkart_com-ecommerce_sample.csv')
#Change the Rows Name
features = df[['retail_price', 'overall_rating', 'product_category_tree', 'is_FK_Advantage_product']]  #Haypitatis Name

print("Selected features:\n", features.head())


Selected features:
    retail_price       overall_rating  \
0         999.0  No rating available   
1       32157.0  No rating available   
2         999.0  No rating available   
3         699.0  No rating available   
4         220.0  No rating available   

                               product_category_tree  is_FK_Advantage_product  
0  ["Clothing >> Women's Clothing >> Lingerie, Sl...                    False  
1  ["Furniture >> Living Room Furniture >> Sofa B...                    False  
2  ["Footwear >> Women's Footwear >> Ballerinas >...                    False  
3  ["Clothing >> Women's Clothing >> Lingerie, Sl...                    False  
4  ["Pet Supplies >> Grooming >> Skin & Coat Care...                    False  


In [17]:
import pandas as pd

# Hypothetical dataset
data = {
    'Product': ['Laptop', 'Shirt', 'Mobile', 'Shoes', 'Tablet', 'Women Clothe', 'Man Clothes'],
    'Category': ['Electronics', 'Category_Fashion', 'Electronics', 'Category_Fashion', 'Electronics', 'Category_Fashion', 'Category_Fashion'],
    'Price': [800, 30, 600, 50, 300, 70, 45],
    'Rating': ['High', 'Medium', 'Low', 'High', 'Medium', 'Low', 'High'],
    'Customer_Age': [34, 22, 40, 29, 24, 30, 35],
    'Category_Fashion' : ['Shirt', 'Shoes', 'Women Clothes', 'Man Clothes', 'Kid clothing', 'girls Clothing', 'Boy shirt' ],
    'Purchased': [1, 0, 1, 0, 1, 1, 0],
    'Purchase_Frequency': [3, 1, 5, 1, 2, 2, 1],
    'Region': ['KBL', 'GH', 'KND', 'MAZ', 'BAD', 'HRT', 'BMY']
}

df = pd.DataFrame(data)

# Feature Engineering

# Creating Price_Bucket feature
df['Price_Bucket'] = pd.cut(df['Price'], bins=[0, 100, 500, 1000], labels=['Low', 'Medium', 'High'])

# Creating Age_Group feature
df['Age_Group'] = pd.cut(df['Customer_Age'], bins=[0, 25, 40, 100], labels=['Youth', 'Adult', 'Senior'])

# Creating High_Rated feature
df['High_Rated'] = df['Rating'].apply(lambda x: 1 if x == 'High' else 0)

print("After Feature Engineering:\n", df[['Product', 'Price', 'Price_Bucket', 'Customer_Age', 'Age_Group', 'Rating', 'High_Rated']])


After Feature Engineering:
         Product  Price Price_Bucket  Customer_Age Age_Group  Rating  \
0        Laptop    800         High            34     Adult    High   
1         Shirt     30          Low            22     Youth  Medium   
2        Mobile    600         High            40     Adult     Low   
3         Shoes     50          Low            29     Adult    High   
4        Tablet    300       Medium            24     Youth  Medium   
5  Women Clothe     70          Low            30     Adult     Low   
6   Man Clothes     45          Low            35     Adult    High   

   High_Rated  
0           1  
1           0  
2           0  
3           1  
4           0  
5           0  
6           1  


In [19]:
from sklearn.model_selection import train_test_split

# Features and target variable
X = df[['Price', 'High_Rated', 'Customer_Age', 'Purchase_Frequency', 'Category_Fashion']]  # Selected features
y = df['Purchased']  # Target variable indicating purchase behavior

# Split the data into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)


Training set size: (4, 5)
Testing set size: (3, 5)


In [21]:
# Check the distribution of the target variable to ensure balance
print("Purchase distribution in training set:\n", y_train.value_counts(normalize=True))

# Check for sufficient data points in each category
print("\nFeature summary in training set:")
print(X_train.describe())

# Confirm data cleaning
print("\nNull values in training set:\n", X_train.isnull().sum())


Purchase distribution in training set:
 Purchased
1    0.5
0    0.5
Name: proportion, dtype: float64

Feature summary in training set:
            Price  High_Rated  Customer_Age  Purchase_Frequency
count    4.000000     4.00000       4.00000            4.000000
mean   248.750000     0.50000      32.00000            2.250000
std    262.690407     0.57735       6.97615            1.892969
min     45.000000     0.00000      24.00000            1.000000
25%     48.750000     0.00000      27.75000            1.000000
50%    175.000000     0.50000      32.00000            1.500000
75%    375.000000     1.00000      36.25000            2.750000
max    600.000000     1.00000      40.00000            5.000000

Null values in training set:
 Price                 0
High_Rated            0
Customer_Age          0
Purchase_Frequency    0
Category_Fashion      0
dtype: int64


In [27]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Define the dataset
data = {
    'Product': ['Laptop', 'Shirt', 'Mobile', 'Shoes', 'Tablet', 'Women Clothes', 'Man Clothes'],
    'Category': ['Electronics', 'Fashion', 'Electronics', 'Fashion', 'Electronics', 'Fashion', 'Fashion'],
    'Price': [800, 30, 600, 50, 300, 70, 45],
    'Rating': ['High', 'Medium', 'Low', 'High', 'Medium', 'Low', 'High'],
    'Customer_Age': [34, 22, 40, 29, 24, 30, 35],
    'Purchased': [1, 0, 1, 0, 1, 1, 0],
    'Purchase_Frequency': [3, 1, 5, 1, 2, 2, 1],
    'Region': ['KBL', 'GH', 'KND', 'MAZ', 'BAD', 'HRT', 'BMY']
}

df = pd.DataFrame(data)

# Feature Engineering
df['Price_Bucket'] = pd.cut(df['Price'], bins=[0, 100, 500, 1000], labels=['Low', 'Medium', 'High'])
df['Age_Group'] = pd.cut(df['Customer_Age'], bins=[0, 25, 40, 100], labels=['Youth', 'Adult', 'Senior'])
df['High_Rated'] = df['Rating'].apply(lambda x: 1 if x == 'High' else 0)

# One-Hot Encoding for 'Category'
df = pd.get_dummies(df, columns=['Category'], drop_first=True)

# Select features and target
X = df[['Price', 'High_Rated', 'Customer_Age', 'Purchase_Frequency', 'Category_Fashion']]
y = df['Purchased']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))



Model Accuracy: 0.6666666666666666

Classification Report:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       1.00      0.50      0.67         2

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3



In [17]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Train the model using the training data
model.fit(X_train, y_train)

print("Model training completed.")


Model training completed.


In [19]:
# Predict on the test set
y_pred = model.predict(X_test)

print("Predictions completed.")


Predictions completed.


In [21]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)


Model Accuracy: 0.67
Confusion Matrix:
 [[1 0]
 [1 1]]


In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib

# Sample dataset
data = {
    'Product': ['Laptop', 'Shirt', 'Mobile', 'Shoes', 'Tablet', 'Women Clothes', 'Man Clothes'],
    'Category': ['Electronics', 'Fashion', 'Electronics', 'Fashion', 'Electronics', 'Fashion', 'Fashion'],
    'Price': [800, 30, 600, 50, 300, 70, 45],
    'Rating': ['High', 'Medium', 'Low', 'High', 'Medium', 'Low', 'High'],
    'Customer_Age': [34, 22, 40, 29, 24, 30, 35],
    'Purchased': [1, 0, 1, 0, 1, 1, 0],
    'Purchase_Frequency': [3, 1, 5, 1, 2, 2, 1]
}

df = pd.DataFrame(data)

# Feature Engineering
df['High_Rated'] = df['Rating'].apply(lambda x: 1 if x == 'High' else 0)
df = pd.get_dummies(df, columns=['Category'], drop_first=True)

# Features and target
X = df[['Price', 'High_Rated', 'Customer_Age', 'Purchase_Frequency', 'Category_Fashion']]
y = df['Purchased']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Save the trained model
joblib.dump(model, 'trained_model.pkl')
print("Model trained and saved as 'trained_model.pkl'.")

Model trained and saved as 'trained_model.pkl'.


In [5]:
# from flask import Flask, request, jsonify
# import joblib

# # Initialize Flask app
# app = Flask(__name__)

# # Load the trained model
# model = joblib.load('trained_model.pkl')

# @app.route('/predict', methods=['POST'])
# def predict():
#     # Example input: {"Price": 300, "High_Rated": 1, "Customer_Age": 24, "Purchase_Frequency": 2, "Category_Fashion": 1}
#     data = request.get_json()
#     features = [data['Price'], data['High_Rated'], data['Customer_Age'], data['Purchase_Frequency'], data['Category_Fashion']]
#     prediction = model.predict([features])
#     return jsonify({'Purchased': int(prediction[0])})

# if __name__ == '__main__':
#     app.run(debug=True)


In [7]:
# {
#   "Price": 300,
#   "High_Rated": 1,
#   "Customer_Age": 24,
#   "Purchase_Frequency": 2,
#   "Category_Fashion": 1
# }


In [9]:
# {
#   "Purchased": 1
# }
