<a href="https://colab.research.google.com/github/sanasiddique-97/Automation-of-bank-annexure-/blob/main/Copy_of_Untitled20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Generating sample Google Play Store data
NUM_APPS = 30
# Non-zero price points; only apps whose Type is 'Paid' receive one of these.
PAID_PRICES = ['0.99', '1.99', '2.99', '4.99', '5.99', '9.99']


def min_max_scale(series):
    """Rescale a numeric Series linearly so its values span [0, 1].

    Args:
        series: Numeric pandas Series to rescale.

    Returns:
        A Series with the same index where the minimum maps to 0.0 and the
        maximum maps to 1.0. When every value is identical (zero range) the
        result is all 0.0 rather than the NaN that a 0/0 division would give.
    """
    lowest = series.min()
    span = series.max() - lowest
    if span == 0:
        # Constant column: (x - min) is already 0 everywhere; skip the 0/0.
        return (series - lowest).astype(float)
    return (series - lowest) / span


# Draw 'Type' first so that 'Price' can be made consistent with it:
# free apps always cost '0', paid apps always get a non-zero price.
app_types = np.random.choice(['Free', 'Paid'], NUM_APPS)
paid_price_draws = np.random.choice(PAID_PRICES, NUM_APPS)

data = {
    'App': [f'App_{i}' for i in range(1, NUM_APPS + 1)],
    'Category': np.random.choice(['Game', 'Productivity', 'Health', 'Education'], NUM_APPS),
    'Rating': np.random.uniform(1.0, 5.0, NUM_APPS).round(1),
    'Reviews': np.random.randint(1, 10000, NUM_APPS),
    'Size': np.random.choice(['Varies with device', '10M', '20M', '30M'], NUM_APPS),
    'Installs': np.random.choice(['1,000+', '10,000+', '50,000+', '100,000+'], NUM_APPS),
    'Type': app_types,
    'Price': np.where(app_types == 'Paid', paid_price_draws, '0'),
    'Content Rating': np.random.choice(['Everyone', 'Teen', 'Everyone 10+', 'Mature 17+'], NUM_APPS)
}

# Creating DataFrame
df = pd.DataFrame(data)

# Basic data cleansing
# Step 1: Convert 'Price' to numeric, mapping 'Free' to 0 and stripping any '$'.
# regex=False: '$' is a regex anchor, so it must be matched as a literal character.
df['Price'] = df['Price'].replace('Free', '0').str.replace('$', '', regex=False).astype(float)

# Additional Steps
# Step 2: Remove duplicates based on the 'App' column
df = df.drop_duplicates(subset=['App'])

# Step 3: Fill missing values in 'Rating' with the mean rating
df['Rating'] = df['Rating'].fillna(df['Rating'].mean())

# Step 4: Normalize the 'Reviews' column (scaling between 0 and 1)
df['Reviews'] = min_max_scale(df['Reviews'])

# Step 5: Simplify 'Size' column (replace 'Varies with device' with NaN)
df['Size'] = df['Size'].replace('Varies with device', np.nan)

# Step 6: Convert 'Size' from string to numeric (assume 'M' means MB)
df['Size'] = df['Size'].str.replace('M', '', regex=False).astype(float)

# Step 7: Fill missing values in 'Size' with the median size
df['Size'] = df['Size'].fillna(df['Size'].median())

# Step 8: Clean 'Installs' column (remove '+' and ',')
# regex=False: '+' is a regex quantifier, so it must be matched literally.
df['Installs'] = (
    df['Installs']
    .str.replace(',', '', regex=False)
    .str.replace('+', '', regex=False)
    .astype(int)
)

# Step 9: Create a new column 'Log_Installs' (log transformation of 'Installs')
df['Log_Installs'] = np.log1p(df['Installs'])

# Step 10: Normalize 'Log_Installs' column
df['Log_Installs'] = min_max_scale(df['Log_Installs'])

# Step 11: Create a new column 'Popularity' based on 'Rating' and 'Reviews'
# ('Reviews' is already scaled to [0, 1] at this point, so this is Rating-weighted.)
df['Popularity'] = df['Rating'] * df['Reviews']

# Check and categorize paid apps: 'Yes'/'No' for paid apps priced under/over 5.00,
# 'N/A' for free apps.
df['Affordable'] = np.where(df['Type'] == 'Paid', np.where(df['Price'] < 5.00, 'Yes', 'No'), 'N/A')

# Display the DataFrame
print(df)

       App      Category  Rating   Reviews  Size  Installs  Type  Price  \
0    App_1        Health     1.0  1.000000  30.0     50000  Free   5.99   
1    App_2  Productivity     2.1  0.106407  30.0     50000  Paid   0.99   
2    App_3          Game     1.6  0.185760  20.0    100000  Free   5.99   
3    App_4          Game     2.4  0.170251  10.0     10000  Paid   9.99   
4    App_5     Education     4.8  0.519697  20.0     10000  Free   1.99   
5    App_6          Game     1.5  0.061580  20.0    100000  Free   0.99   
6    App_7  Productivity     1.3  0.708173  10.0      1000  Free   2.99   
7    App_8     Education     1.2  0.627236  20.0     10000  Free   2.99   
8    App_9          Game     1.9  0.681798  10.0    100000  Paid   0.99   
9   App_10          Game     3.2  0.594295  20.0     10000  Paid   2.99   
10  App_11        Health     4.3  0.346729  20.0     50000  Free   0.99   
11  App_12          Game     1.3  0.060109  10.0      1000  Free   1.99   
12  App_13          Game 