##### Note : Read the iphone.csv dataset to pandas dataframe

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('iphone.csv')
df

Unnamed: 0,Product Name,Product URL,Brand,Sale Price,Mrp,Number Of Ratings,Number Of Reviews,Upc,Star Rating,Ram
0,"APPLE iPhone 8 Plus (Gold, 64 GB)",https://www.flipkart.com/apple-iphone-8-plus-g...,Apple,49900,49900,3431,356,MOBEXRGV7EHHTGUH,4.6,2 GB
1,"APPLE iPhone 8 Plus (Space Grey, 256 GB)",https://www.flipkart.com/apple-iphone-8-plus-s...,Apple,84900,84900,3431,356,MOBEXRGVAC6TJT4F,4.6,2 GB
2,"APPLE iPhone 8 Plus (Silver, 256 GB)",https://www.flipkart.com/apple-iphone-8-plus-s...,Apple,84900,84900,3431,356,MOBEXRGVGETABXWZ,4.6,2 GB
3,"APPLE iPhone 8 (Silver, 256 GB)",https://www.flipkart.com/apple-iphone-8-silver...,Apple,77000,77000,11202,794,MOBEXRGVMZWUHCBA,,2 GB
4,"APPLE iPhone 8 (Gold, 256 GB)",https://www.flipkart.com/apple-iphone-8-gold-2...,Apple,77000,77000,11202,794,MOBEXRGVPK7PFEJZ,4.5,2 GB
...,...,...,...,...,...,...,...,...,...,...
57,"APPLE iPhone SE (Black, 64 GB)",https://www.flipkart.com/apple-iphone-se-black...,Apple,29999,39900,95909,8161,MOBFWQ6BR3MK7AUG,4.5,4 GB
58,"APPLE iPhone 11 (Purple, 64 GB)",https://www.flipkart.com/apple-iphone-11-purpl...,Apple,46999,54900,43470,3331,MOBFWQ6BTFFJKGKE,4.6,4 GB
59,"APPLE iPhone 11 (White, 64 GB)",https://www.flipkart.com/apple-iphone-11-white...,Apple,46999,54900,43470,3331,MOBFWQ6BVWVEH3XE,4.6,4 GB
60,"APPLE iPhone 11 (Black, 64 GB)",https://www.flipkart.com/apple-iphone-11-black...,Apple,46999,54900,43470,3331,MOBFWQ6BXGJCEYNY,4.6,4 GB


#

#### 1. The column names have spaces . rename the column names to have underscore '_' instead of space (try to do in one go instead of specifying each column nam in rename method)

In [3]:
df.columns = [col_name.replace(' ', '_') for col_name in df.columns]

# df.columns = df.columns.str.replace(' ', '_')

In [4]:
df.columns

Index(['Product_Name', 'Product_URL', 'Brand', 'Sale_Price', 'Mrp',
       'Number_Of_Ratings', 'Number_Of_Reviews', 'Upc', 'Star_Rating', 'Ram'],
      dtype='object')

#

#### 2. star rating for some of the models is missing in the dataset. fill those missing values with the average rating all the models.

In [5]:
df['Star_Rating'].fillna(value=df['Star_Rating'].mean())

0     4.600000
1     4.600000
2     4.600000
3     4.576271
4     4.500000
        ...   
57    4.500000
58    4.600000
59    4.600000
60    4.600000
61    4.600000
Name: Star_Rating, Length: 62, dtype: float64

#

#### 3. Now instead of filling missing values with avg rating of full dataset , fill with avg rating based on RAM. example :  if rating for a 2 gb phone is missing then take average of all other 2 gb phones rating and fill that value. 

In [6]:
# Calculate the average rating for each RAM group
average_ratings_by_ram = df.groupby('Ram')['Star_Rating'].mean().reset_index()

In [7]:
# Rename the 'Star Rating' column to 'Average Rating' for clarity
average_ratings_by_ram.rename(columns={'Star_Rating': 'Average_Rating'}, inplace=True)

In [8]:
average_ratings_by_ram

Unnamed: 0,Ram,Average_Rating
0,2 GB,4.541667
1,3 GB,4.6
2,4 GB,4.589286
3,6 GB,4.577778


In [9]:
# Perform a left join to fill missing ratings based on RAM
df = df.merge(average_ratings_by_ram, on='Ram', how='left')

In [10]:
# Fill missing ratings with the calculated average ratings
df['Star_Rating'].fillna(df['Average_Rating'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Star_Rating'].fillna(df['Average_Rating'], inplace=True)


In [11]:
# Drop the 'Average Rating' column if not needed
df.drop(columns=['Average_Rating'], inplace=True)

In [12]:
# no NaN values
df[df['Star_Rating'].isna()]

Unnamed: 0,Product_Name,Product_URL,Brand,Sale_Price,Mrp,Number_Of_Ratings,Number_Of_Reviews,Upc,Star_Rating,Ram


#

#### 4. create a new column in the dataframe "Discount_Percentage" based on MRP and sale value

In [13]:
df['Discount_Percentage'] = ((df['Mrp'] - df['Sale_Price'])/df['Mrp']) * 100

In [14]:
df

Unnamed: 0,Product_Name,Product_URL,Brand,Sale_Price,Mrp,Number_Of_Ratings,Number_Of_Reviews,Upc,Star_Rating,Ram,Discount_Percentage
0,"APPLE iPhone 8 Plus (Gold, 64 GB)",https://www.flipkart.com/apple-iphone-8-plus-g...,Apple,49900,49900,3431,356,MOBEXRGV7EHHTGUH,4.600000,2 GB,0.000000
1,"APPLE iPhone 8 Plus (Space Grey, 256 GB)",https://www.flipkart.com/apple-iphone-8-plus-s...,Apple,84900,84900,3431,356,MOBEXRGVAC6TJT4F,4.600000,2 GB,0.000000
2,"APPLE iPhone 8 Plus (Silver, 256 GB)",https://www.flipkart.com/apple-iphone-8-plus-s...,Apple,84900,84900,3431,356,MOBEXRGVGETABXWZ,4.600000,2 GB,0.000000
3,"APPLE iPhone 8 (Silver, 256 GB)",https://www.flipkart.com/apple-iphone-8-silver...,Apple,77000,77000,11202,794,MOBEXRGVMZWUHCBA,4.541667,2 GB,0.000000
4,"APPLE iPhone 8 (Gold, 256 GB)",https://www.flipkart.com/apple-iphone-8-gold-2...,Apple,77000,77000,11202,794,MOBEXRGVPK7PFEJZ,4.500000,2 GB,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
57,"APPLE iPhone SE (Black, 64 GB)",https://www.flipkart.com/apple-iphone-se-black...,Apple,29999,39900,95909,8161,MOBFWQ6BR3MK7AUG,4.500000,4 GB,24.814536
58,"APPLE iPhone 11 (Purple, 64 GB)",https://www.flipkart.com/apple-iphone-11-purpl...,Apple,46999,54900,43470,3331,MOBFWQ6BTFFJKGKE,4.600000,4 GB,14.391621
59,"APPLE iPhone 11 (White, 64 GB)",https://www.flipkart.com/apple-iphone-11-white...,Apple,46999,54900,43470,3331,MOBFWQ6BVWVEH3XE,4.600000,4 GB,14.391621
60,"APPLE iPhone 11 (Black, 64 GB)",https://www.flipkart.com/apple-iphone-11-black...,Apple,46999,54900,43470,3331,MOBFWQ6BXGJCEYNY,4.600000,4 GB,14.391621


#

#### 5. which model has highest percent discount ?

In [15]:
# solution 1 
df[df['Discount_Percentage'] == df['Discount_Percentage'].max()][['Product_Name','Discount_Percentage']]

Unnamed: 0,Product_Name,Discount_Percentage
18,"APPLE iPhone 11 Pro (Midnight Green, 64 GB)",29.644465


In [16]:
# solution 2 using sort
sorted_df = df.sort_values(by='Discount_Percentage', ascending=False)

# Get the model with the highest discount and its corresponding percentage
model_with_highest_discount = sorted_df.iloc[0]['Product_Name']
highest_discount_percentage = sorted_df.iloc[0]['Discount_Percentage']

print('model_with_highest_discount:', model_with_highest_discount)
print('highest_discount_percentage:', highest_discount_percentage)

model_with_highest_discount: APPLE iPhone 11 Pro (Midnight Green, 64 GB)
highest_discount_percentage: 29.644465290806753


#

#### 6. find total no of models  each space configuration (128 GB , 64 GB etc)

In [87]:
# create column with storage space
df['Storage_space'] = [name[-7:-1].strip() for name in df['Product_Name'].to_list()]

In [88]:
#  find total number of products for each space configuration
df.groupby('Storage_space')["Product_Name"].count()

Storage_space
128 GB    24
256 GB    14
512 GB     4
64 GB     20
Name: Product_Name, dtype: int64

In [89]:
 # Drop the 'Storage_name' column if not needed
# df.drop(columns=['Storage_space'], inplace=True)

#

#### 7. find total number of models for each color 

In [23]:
# Define a function to extract color based on a pattern
def extract_color(product_name):
    # Split the product name by '(' and ')' to get parts within parentheses
    product_split = product_name.split('(')[1]

    if len(product_split) > 1:
        # Check if the second part contains a comma (,) to identify color
        color = product_split.split(',')[0].strip()
        return color
    return None
    

In [28]:
# Apply the extract_color function to create a 'Color' column
df['Color'] = df['Product_Name'].apply(extract_color)

In [29]:
# Count the number of models for each color
df.groupby("Color")['Product_Name'].count()

Color
Black             10
Blue               2
Coral              1
Gold               6
Graphite           4
Green              1
Midnight Green     4
Pacific Blue       4
Purple             1
RED                1
Red                5
Silver             7
Space Grey         6
White             10
Name: Product_Name, dtype: int64

In [30]:
# Count the number of models for each color
color_counts = df['Color'].value_counts().reset_index()
color_counts

Unnamed: 0,Color,count
0,Black,10
1,White,10
2,Silver,7
3,Space Grey,6
4,Gold,6
5,Red,5
6,Graphite,4
7,Midnight Green,4
8,Pacific Blue,4
9,Blue,2


#

#### 8. find total number of models by iphone version : eg
iphone 8:  9

iphone XR : 5

so on..

In [31]:
# Define a function to iphone version based on a pattern
def extract_version(product_name):
    # Split the product name by '(' and ')' to get parts within parentheses
    product_split = product_name.split('iPhone')
        
    if len(product_split) > 1:
        # Check if the second part contains a comma (,) to identify color
        version = product_split[1].strip().split(' ')[0]
        return 'iphone ' + version
    return None

In [32]:
# Apply the extract_color function to create a 'Version' column
df['Version'] = df['Product_Name'].apply(extract_version)

In [33]:
# Count the number of models for each version
version_counts = df['Version'].value_counts().reset_index()
version_counts

Unnamed: 0,Version,count
0,iphone 12,26
1,iphone 11,16
2,iphone 8,8
3,iphone SE,6
4,iphone XR,5
5,iphone XS,1


#

#### 9. list top 5 models having highest no of reviews 

In [34]:
review_df = df.sort_values(by ='Number_Of_Reviews', ascending=False)

#

#### 10. what is the price diffrence between highest price and lowest price iphone (based on mrp)

In [36]:
prize_difference  = df['Mrp'].max() - df['Mrp'].min()

In [37]:
print('prize_difference:', prize_difference)

prize_difference: 110000


#

#### 11. find total no of reviews for iphone 11 and iphone 12 category . Output should have only 2 rows (for 11 and 12).

In [69]:
# Filter the DataFrame for iPhone 11 and iPhone 12 categories
iphone_11_reviews = df[df['Product_Name'].str.contains('iPhone 11', case=False)]
iphone_12_reviews = df[df['Product_Name'].str.contains('iPhone 12', case=False)]


In [70]:
# Calculate the total number of reviews for each category
total_reviews_11 = iphone_11_reviews['Number_Of_Reviews'].sum()
total_reviews_12 = iphone_12_reviews['Number_Of_Reviews'].sum()

In [71]:
# Create a new DataFrame with the results
result_df = pd.DataFrame({
    'iPhone Category': ['iPhone 11', 'iPhone 12'],
    'Total Reviews': [total_reviews_11, total_reviews_12]
})

In [72]:
result_df

Unnamed: 0,iPhone Category,Total Reviews
0,iPhone 11,25965
1,iPhone 12,2208


#

#### 12. which iphone has 3rd highest MRP

In [77]:
sorted_mrp_df = df.sort_values(by='Mrp', ascending=False).reset_index(drop=True)

In [79]:
# Get the iPhone with the 3rd highest MRP
third_highest_mrp_iphone = sorted_mrp_df.loc[2]

In [80]:
print(third_highest_mrp_iphone)

Product_Name                    APPLE iPhone 11 Pro (Space Grey, 512 GB)
Product_URL            https://www.flipkart.com/apple-iphone-11-pro-s...
Brand                                                              Apple
Sale_Price                                                        117900
Mrp                                                               140300
Number_Of_Ratings                                                   7088
Number_Of_Reviews                                                    523
Upc                                                     MOBFKCTSRTHRQTFT
Star_Rating                                                          4.6
Ram                                                                 4 GB
Discount_Percentage                                            15.965788
Color                                                         Space Grey
Version                                                        iphone 11
Name: 2, dtype: object


#

#### 13. what is the average mrp of iphones which costs above 100,000

In [98]:
# Filter the DataFrame to include only iPhones with MRP above 100,000
high_mrp_iphones = df[df['Mrp'] > 100000]

In [99]:
# Calculate the average MRP for the filtered iPhones
average_mrp = high_mrp_iphones['Mrp'].mean()

In [100]:
# Print the average MRP
print(average_mrp)

130559.09090909091


#

#### 14. which iphone with 128 GB space has highest ratings to review ratio

In [91]:
# Filter the DataFrame to include only iPhones with 128 GB of space
iphone_128gb = df[df['Storage_space'] == '128 GB']

In [93]:
# Calculate the ratings-to-reviews ratio for each iPhone
iphone_128gb['Ratings_to_Reviews_Ratio'] = iphone_128gb['Number_Of_Ratings'] / iphone_128gb['Number_Of_Reviews']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iphone_128gb['Ratings_to_Reviews_Ratio'] = iphone_128gb['Number_Of_Ratings'] / iphone_128gb['Number_Of_Reviews']


In [94]:
# Find the iPhone with the highest ratings-to-reviews ratio
highest_ratio_iphone = iphone_128gb.loc[iphone_128gb['Ratings_to_Reviews_Ratio'].idxmax()]

In [95]:
highest_ratio_iphone

Product_Name                                  APPLE iPhone 11 (Black, 128 GB)
Product_URL                 https://www.flipkart.com/apple-iphone-11-black...
Brand                                                                   Apple
Sale_Price                                                              54999
Mrp                                                                     59900
Number_Of_Ratings                                                       43470
Number_Of_Reviews                                                        3331
Upc                                                          MOBFWQ6BKRYBP5X8
Star_Rating                                                               4.6
Ram                                                                      4 GB
Discount_Percentage                                                   8.18197
Color                                                                   Black
Version                                                         