In [1]:
import pandas as pd

In [2]:
# Read the source CSV into the working DataFrame that the later cells operate on.
SOURCE_CSV = "myntra_dataset.csv"
df = pd.read_csv(SOURCE_CSV)

In [3]:
# Preview the first ten rows to see what each column holds.
df.head(10)

Unnamed: 0,brand_name,pants_description,price,MRP,discount_percent,ratings,number_of_ratings
0,WROGN,Men Loose Fit Cotton Jeans,1374.0,2499.0,0.45,4.2,57.0
1,Flying Machine,Men Slim Fit Jeans,1829.0,2999.0,0.39,4.6,5.0
2,Roadster,Men Pure Cotton Jeans,974.0,2499.0,0.61,3.6,1100.0
3,Bene Kleed,Relaxed Fit Denim Jeans,873.0,2299.0,0.62,4.0,4800.0
4,Levis,Men 511 Slim Fit Jeans,1478.0,2899.0,0.49,4.3,264.0
5,HERE&NOW,Men Stretchable Jeans,798.0,1699.0,0.53,4.0,33.0
6,Urbano Fashion,Men Relaxed Fit Jeans,944.0,2099.0,0.55,4.0,4200.0
7,WROGN,Men Anti Fit Jeans,1623.0,2799.0,0.42,4.2,42.0
8,Bene Kleed,Men Wide Leg Heavy Fade Jeans,839.0,2799.0,0.7,3.6,114.0
9,Bene Kleed,Relaxed Fit Denim Jeans,873.0,2299.0,0.62,4.1,5200.0


In [4]:
# Row and column counts of the loaded frame.
df.shape

(52120, 7)

In [5]:
# Per-column dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52120 entries, 0 to 52119
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   brand_name         52120 non-null  object 
 1   pants_description  52120 non-null  object 
 2   price              52120 non-null  float64
 3   MRP                52120 non-null  float64
 4   discount_percent   52120 non-null  float64
 5   ratings            52120 non-null  float64
 6   number_of_ratings  52120 non-null  float64
dtypes: float64(5), object(2)
memory usage: 2.8+ MB


In [6]:
# Summary statistics for the numeric columns.
# NOTE(review): in the output recorded below, discount_percent has a maximum of 64.0
# while its 75th percentile is 0.63 - the column may mix fractional and percentage
# values; confirm before relying on it.
df.describe()

Unnamed: 0,price,MRP,discount_percent,ratings,number_of_ratings
count,52120.0,52120.0,52120.0,52120.0,52120.0
mean,1594.515445,3180.398438,1.648256,3.997794,91.568937
std,1495.972325,2201.883218,4.687529,0.420404,433.918513
min,337.0,499.0,0.02,1.0,5.0
25%,989.0,2499.0,0.4,3.8,16.0
50%,1439.0,2999.0,0.5,4.0,35.0
75%,1829.0,3499.0,0.63,4.2,74.0
max,54000.0,72000.0,64.0,5.0,30700.0


In [8]:
# Count missing values per column (isna performs the same check as isnull).
df.isna().sum()

brand_name           0
pants_description    0
price                0
MRP                  0
discount_percent     0
ratings              0
number_of_ratings    0
dtype: int64

In [10]:
# Column dtypes on their own (df.info() reports these as well).
df.dtypes

brand_name            object
pants_description     object
price                float64
MRP                  float64
discount_percent     float64
ratings              float64
number_of_ratings    float64
dtype: object

In [11]:
import numpy as np

# Creating a few columns for further analysis
- **Effective Discount Percent** <br>
      Recomputes the discount from MRP and price; used to study the relationship between discount and demand
- **Rating Weighted** <br>
      Rating scaled by the log of the number of ratings; helps identify trustworthy and high-performing products
- **Value for Money Score** <br>
      Rating per 1,000 units of price; useful for pricing optimization and competitive positioning
- **Popularity Index** <br>
      Combines the weighted rating and the effective discount percent as a rough indicator of overall product demand

In [12]:
# Creating column "effective_discount_percent"
# Discount implied by MRP and price, expressed on a 0-100 scale:
# (MRP - price) / MRP * 100, evaluated in that order.

df['effective_discount_percent'] = (
    df['MRP']
    .sub(df['price'])
    .div(df['MRP'])
    .mul(100)
)

In [14]:
# List the columns to confirm 'effective_discount_percent' was added.
df.columns

Index(['brand_name', 'pants_description', 'price', 'MRP', 'discount_percent',
       'ratings', 'number_of_ratings', 'effective_discount_percent'],
      dtype='object')

In [15]:
# Creating column "rating_weighted"
# The rating is multiplied by the natural log of (number_of_ratings + 1),
# so for the same rating a product with more ratings scores higher.

df['rating_weighted'] = df['ratings'].mul(
    np.log(df['number_of_ratings'].add(1))
)

In [16]:
# List the columns to confirm 'rating_weighted' was added.
df.columns

Index(['brand_name', 'pants_description', 'price', 'MRP', 'discount_percent',
       'ratings', 'number_of_ratings', 'effective_discount_percent',
       'rating_weighted'],
      dtype='object')

In [17]:
# Creating column "value_for_money_score"
# Rating divided by the price expressed in thousands (price / 1000).

price_in_thousands = df['price'].div(1000)
df['value_for_money_score'] = df['ratings'].div(price_in_thousands)

In [18]:
# List the columns to confirm 'value_for_money_score' was added.
df.columns

Index(['brand_name', 'pants_description', 'price', 'MRP', 'discount_percent',
       'ratings', 'number_of_ratings', 'effective_discount_percent',
       'rating_weighted', 'value_for_money_score'],
      dtype='object')

In [19]:
# Creating column "popularity_index"
# Plain sum of the two derived columns 'rating_weighted' and
# 'effective_discount_percent'.
# NOTE(review): the two terms are on different scales - in the describe() output
# recorded further down, rating_weighted peaks near 40 while
# effective_discount_percent reaches about 86 - so the discount term may dominate;
# confirm that this weighting is intended.

df['popularity_index'] = df['rating_weighted'].add(
    df['effective_discount_percent']
)

In [20]:
# List the columns to confirm 'popularity_index' was added.
df.columns

Index(['brand_name', 'pants_description', 'price', 'MRP', 'discount_percent',
       'ratings', 'number_of_ratings', 'effective_discount_percent',
       'rating_weighted', 'value_for_money_score', 'popularity_index'],
      dtype='object')

In [21]:
# Preview the first rows with the derived columns included.
df.head()

Unnamed: 0,brand_name,pants_description,price,MRP,discount_percent,ratings,number_of_ratings,effective_discount_percent,rating_weighted,value_for_money_score,popularity_index
0,WROGN,Men Loose Fit Cotton Jeans,1374.0,2499.0,0.45,4.2,57.0,45.018007,17.053861,3.056769,62.071868
1,Flying Machine,Men Slim Fit Jeans,1829.0,2999.0,0.39,4.6,5.0,39.013004,8.242094,2.515036,47.255098
2,Roadster,Men Pure Cotton Jeans,974.0,2499.0,0.61,3.6,1100.0,61.02441,25.214307,3.696099,86.238717
3,Bene Kleed,Relaxed Fit Denim Jeans,873.0,2299.0,0.62,4.0,4800.0,62.026968,33.906318,4.581901,95.933286
4,Levis,Men 511 Slim Fit Jeans,1478.0,2899.0,0.49,4.3,264.0,49.016902,23.992838,2.909337,73.009741


In [22]:
# Re-check dtypes and non-null counts now that the derived columns exist.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52120 entries, 0 to 52119
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   brand_name                  52120 non-null  object 
 1   pants_description           52120 non-null  object 
 2   price                       52120 non-null  float64
 3   MRP                         52120 non-null  float64
 4   discount_percent            52120 non-null  float64
 5   ratings                     52120 non-null  float64
 6   number_of_ratings           52120 non-null  float64
 7   effective_discount_percent  52120 non-null  float64
 8   rating_weighted             52120 non-null  float64
 9   value_for_money_score       52120 non-null  float64
 10  popularity_index            52120 non-null  float64
dtypes: float64(9), object(2)
memory usage: 4.4+ MB


In [23]:
# Summary statistics again, now covering the derived columns as well.
df.describe()

Unnamed: 0,price,MRP,discount_percent,ratings,number_of_ratings,effective_discount_percent,rating_weighted,value_for_money_score,popularity_index
count,52120.0,52120.0,52120.0,52120.0,52120.0,52120.0,52120.0,52120.0,52120.0
mean,1594.515445,3180.398438,1.648256,3.997794,91.568937,50.555769,14.498543,3.333369,65.054311
std,1495.972325,2201.883218,4.687529,0.420404,433.918513,14.780993,4.927046,1.899091,15.383491
min,337.0,499.0,0.02,1.0,5.0,2.000488,1.791759,0.075362,7.83324
25%,989.0,2499.0,0.4,3.8,16.0,40.011432,10.766211,2.186987,56.579017
50%,1439.0,2999.0,0.5,4.0,35.0,50.017864,13.689138,2.73187,64.536892
75%,1829.0,3499.0,0.63,4.2,74.0,61.016491,17.459905,3.850932,76.208183
max,54000.0,72000.0,64.0,5.0,30700.0,86.135038,40.049547,12.698413,113.551152


In [24]:
# Write the frame, including the derived columns, to CSV for further analysis.
# index=False leaves the row index out of the file.

df.to_csv(path_or_buf='myntra_menswear.csv', index=False)