In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Global plotting defaults: matplotlib style sheet plus seaborn's default colour palette.
PLOT_STYLE = 'seaborn-v0_8-whitegrid'
DEFAULT_PALETTE = 'Set2'

plt.style.use(PLOT_STYLE)
sns.set_palette(DEFAULT_PALETTE)


In [None]:
# Load the dataset and discard every row that contains at least one missing value.
df = pd.read_csv('amazon.csv').dropna()

# Per-column count of remaining nulls (a sanity check on the dropna above).
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Rich preview of the first rows of the frame.
preview_rows = df.head()
display(preview_rows)

In [None]:
# Header line followed by the dtype of every column.
print('\nTipos de datos:', df.dtypes, sep='\n')

In [None]:
from seaborn import categorical  # NOTE(review): not referenced in the visible cells — confirm before removing


# Replace missing values in each numeric column with that column's median.
# NOTE(review): the load cell already applies dropna() to df, so this loop
# probably finds nothing to fill — confirm whether it is still needed.
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    column_values = df[col]
    if column_values.isnull().any():
        df[col] = column_values.fillna(column_values.median())

In [None]:
# Parse 'actual_price' into floats: strip the rupee sign and the thousands
# separators before converting.
df['actual_price'] = (
    df['actual_price']
    .replace({'₹': '', ',': ''}, regex=True)
    .astype(float)
)

# Summary statistics of the list price.
# The mean is stored as average_ap (matching min_ap / mid_ap / max_ap) so it
# does not share a name with average_dp, which the discount-percentage cell
# uses for its own mean.
average_ap = df['actual_price'].mean()
min_ap = df['actual_price'].min()
mid_ap = df['actual_price'].median()
max_ap = df['actual_price'].max()
range_ap = max_ap - min_ap

print(f'Precio Promedio: {average_ap}')
print(f'Precio Minimo: {min_ap}')
print(f'Precio Mediano: {mid_ap}')
print(f'Precio Maximo: {max_ap}')
print(f'Rango del Precio: {range_ap}')


In [None]:
# Convert 'rating' to numbers; errors='coerce' turns unparseable entries into NaN
# instead of raising.
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# Summary statistics of the rating.
# The mean keeps two decimals: rounding it to a whole number would hide most
# of the variation, and the per-category cell also reports rating means
# with two decimals.
average_rating = round(df['rating'].mean(), 2)
min_rating = df['rating'].min()
max_rating = df['rating'].max()
mid_rating = df['rating'].median()
range_rating = max_rating - min_rating

print("Rating Promedio:", average_rating)
print("Rating Minimo:", min_rating)
print("Rating Maximo:", max_rating)
print("Rating Medio:", mid_rating)
print("Rango del Rating:", range_rating)

In [None]:
# Parse 'discount_percentage' into floats by stripping the '%' sign.
# The column is cast to str first so the cell can be re-run: once the column
# has been converted to float, the .str accessor would otherwise raise
# AttributeError on the second execution.
df['discount_percentage'] = (
    df['discount_percentage']
    .astype(str)
    .str.replace('%', '', regex=False)
    .astype(float)
)

# Summary statistics, rounded to whole percentages for display.
average_dp = round(df['discount_percentage'].mean())
min_dp = round(df['discount_percentage'].min())
max_dp = round(df['discount_percentage'].max())
mid_dp = round(df['discount_percentage'].median())
# min_dp and max_dp are already whole numbers, so no further rounding is needed.
range_dp = max_dp - min_dp

print(f'Promedio del porcentaje de descuento: {average_dp}%')
print(f'Porcentaje de descuento minimo: {min_dp}%')
print(f'Porcentaje de descuento maximo: {max_dp}%')
print(f'Porcentaje de descuento medio: {mid_dp}%')
print(f'Rango del Porcentaje de descuento: {range_dp}%')

In [None]:
# Number of products per category, most frequent first (only if the column exists).
has_category = 'category' in df.columns
if has_category:
    category_counts = df['category'].value_counts()
    print("\nDistribucion de las categorias de los productos:")
    print(category_counts)

In [None]:
# Average price and rating for the five most frequent categories.
# The counts are recomputed here, behind the same column guard used by the
# other cells, because the earlier cell only defines category_counts when the
# 'category' column exists.
if 'category' in df.columns:
    category_counts = df['category'].value_counts()
    top_categories = category_counts.nlargest(5).index

    # Column checks do not change between iterations, so evaluate them once.
    has_price = 'actual_price' in df.columns
    has_rating = 'rating' in df.columns

    print("\nEstadisticas de las mejores 5 categorias:")
    for category in top_categories:
        subset = df[df['category'] == category]
        print(f"\nCategoria: {category} (Count: {len(subset)})")
        if has_price:
            # Prices are parsed from rupee-formatted strings, so label them with ₹.
            print(f"Precio Promedio: ₹{subset['actual_price'].mean():.2f}")
        if has_rating:
            print(f"Rating Promedio: {subset['rating'].mean():.2f}")

In [None]:
# Histogram of the list price (20 bins), drawn only when the column is present.
if 'actual_price' in df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(df['actual_price'], bins=20, color='skyblue', edgecolor='black')
    ax.set_title('Distribucion del precio')
    ax.set_xlabel('Precio')
    ax.set_ylabel('Frecuencia')
    ax.grid(True)
    plt.show()

In [None]:
# Count of products for each distinct rating value.
if 'rating' in df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x='rating', palette='coolwarm', ax=ax)
    ax.set_title('Distribucion de los ratings')
    ax.set_xlabel('Rating')
    ax.set_ylabel('Count')
    fig.tight_layout()
    plt.show()
    

In [None]:
# Horizontal bar chart of the ten most frequent product categories.
if 'category' in df.columns:
    top_cats = df['category'].value_counts().nlargest(10)

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.barplot(y=top_cats.index, x=top_cats.values, palette='coolwarm', ax=ax)
    # Title spelling corrected ("Categorias").
    ax.set_title('Top 10 Categorias de los productos')
    ax.set_xlabel('Count')
    ax.set_ylabel('Categoria')
    fig.tight_layout()
    plt.show()

In [None]:
# Scatter plot of list price against rating.
if 'actual_price' in df.columns and 'rating' in df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(x='actual_price', y='rating', data=df, alpha=0.6, ax=ax)
    ax.set_title('Precio comparado con el Rating')
    # Prices are parsed from rupee-formatted strings, so the axis is labelled in ₹.
    ax.set_xlabel('Precio (₹)')
    ax.set_ylabel('Rating')
    # Cut the x-axis at the 95th percentile so the most expensive products
    # do not compress the rest of the points.
    ax.set_xlim(0, df['actual_price'].quantile(0.95))
    fig.tight_layout()
    plt.show()

In [None]:
# Box plot of the list price for the five most frequent categories.
if 'category' in df.columns and 'actual_price' in df.columns:
    top_5_categories = df['category'].value_counts().nlargest(5).index
    category_subset = df[df['category'].isin(top_5_categories)]

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.boxplot(y='category', x='actual_price', data=category_subset,
                palette='coolwarm', ax=ax)
    # Title spelling corrected ("Distribucion").
    ax.set_title('Distribucion de los precios por las top 5 categorias')
    # Prices are parsed from rupee-formatted strings, so the axis is labelled in ₹.
    ax.set_xlabel('Precio (₹)')
    ax.set_ylabel('Categoria')
    # Cut the x-axis at the subset's 95th percentile to keep the boxes readable.
    ax.set_xlim(0, category_subset['actual_price'].quantile(0.95))
    fig.tight_layout()
    plt.show()

In [None]:
# Mean rating of the ten most frequent categories, highest average first.
if 'category' in df.columns and 'rating' in df.columns:
    top_10_categories = df['category'].value_counts().nlargest(10).index
    category_subset = df[df['category'].isin(top_10_categories)]
    rating_by_category = category_subset.groupby('category')['rating']
    avg_ratings = rating_by_category.mean().sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.barplot(y=avg_ratings.index, x=avg_ratings.values, palette='coolwarm', ax=ax)
    ax.set_title('Rating Promedio por las top 10 categorias')
    ax.set_xlabel('Rating promedio')
    ax.set_ylabel('Categoria')
    ax.set_xlim(0, 5)
    fig.tight_layout()
    plt.show()
