In [1]:
# Importing libraries
import re

import numpy as np
import pandas as pd

# Load the dataset
data = pd.read_csv("renttherunway.csv")

# Check the first few samples, shape, and info of the data
print(data.head())
print(data.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the summary.
data.info()

   Unnamed: 0  fit  user_id bust size  item_id  weight  rating     rented for  \
0           0  fit   420272       34d  2260466  137lbs    10.0       vacation   
1           1  fit   273551       34b   153475  132lbs    10.0          other   
2           2  fit   360448       NaN  1063761     NaN    10.0          party   
3           3  fit   909926       34c   126335  135lbs     8.0  formal affair   
4           4  fit   151944       34b   616682  145lbs    10.0        wedding   

                                         review_text          body type  \
0  An adorable romper! Belt and zipper were a lit...          hourglass   
1  I rented this dress for a photo shoot. The the...  straight & narrow   
2  This hugged in all the right places! It was a ...                NaN   
3  I rented this for my company's black tie award...               pear   
4  I have always been petite in my upper body and...           athletic   

                                      review_summary category 

In [2]:
# Remove duplicate records.
# 'Unnamed: 0' mirrors the row index in the head() output, so it looks like a
# row counter written out with the CSV — TODO confirm. Comparing on it would
# make every row unique and drop_duplicates() would never remove anything, so
# it is left out of the comparison.
# NOTE(review): if it really is just a saved index, consider dropping it too.
dedupe_cols = [col for col in data.columns if col != 'Unnamed: 0']
data = data.drop_duplicates(subset=dedupe_cols)

# Drop redundant columns (assign the result instead of mutating in place)
data = data.drop(columns=['user_id', 'item_id', 'review_text', 'review_summary'])

In [3]:
# Strip the 'lbs' unit from 'weight' and store the column as floats
weight_text = data['weight'].str.replace('lbs', '')
data['weight'] = weight_text.astype(float)

In [4]:
# Fold the 'party: cocktail' label into the plain 'party' label
data['rented for'] = data['rented for'].replace({'party: cocktail': 'party'})

In [12]:
# Function to convert height from feet and inches to inches
def convert_height(height):
    """Convert a feet-and-inches height string to a total number of inches.

    Parameters
    ----------
    height : object
        Raw height value. It is converted with str() before parsing, so any
        type is accepted. Values for which pd.isnull() is true are treated
        as missing.

    Returns
    -------
    int or None
        feet * 12 + inches when a "<digits>'" foot mark is found, otherwise
        None (missing input or no recognisable height).
    """
    if pd.isnull(height):
        return None
    # Whitespace after the foot mark and the inches part are both optional,
    # so values written with a space, without a space, or as feet only all
    # parse instead of silently becoming missing.
    match = re.search(r"(\d+)'\s*(\d+)?", str(height))
    if match is None:
        return None
    feet = int(match.group(1))
    inches = int(match.group(2)) if match.group(2) else 0
    return feet * 12 + inches

# Run convert_height over every 'height' entry, then store the column as floats
height_in_inches = data['height'].apply(convert_height)
data['height'] = height_in_inches.astype(float)


In [None]:
# Count the missing values in each column and print the totals
missing_values = data.isna().sum()
print(missing_values)

In [None]:
# Impute missing values
# For numerical columns, you can use mean, median, or mode
# The filled Series is assigned back to the column. Calling
# fillna(inplace=True) on data[col] is chained assignment: it is deprecated
# in recent pandas and, with Copy-on-Write enabled, leaves `data` unchanged.
data['weight'] = data['weight'].fillna(data['weight'].mean())
data['age'] = data['age'].fillna(data['age'].median())

In [None]:
# For categorical columns, you can use the mode
# mode() returns a Series of the most frequent value(s); [0] takes the first.
# The result is assigned back rather than using fillna(inplace=True) on
# data[col], which is chained assignment and does not update `data` when
# Copy-on-Write is enabled.
data['category'] = data['category'].fillna(data['category'].mode()[0])

# Check the statistical summary
data.describe()

In [None]:
# Treat outliers in the 'age' column using the IQR method
Q1 = data['age'].quantile(0.25)
Q3 = data['age'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only rows whose age lies within [lower_bound, upper_bound]; a NaN age
# fails both comparisons and is removed as well.
age_in_range = (data['age'] >= lower_bound) & (data['age'] <= upper_bound)
# .copy() makes the filtered result an independent frame, so assigning to its
# columns afterwards does not raise SettingWithCopyWarning.
data = data[age_in_range].copy()

In [None]:
# Bar chart of how many rows fall under each 'rented for' value
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 6))
# countplot draws on the figure created above and returns its Axes, which is
# then used to set the title and axis labels.
count_axes = sns.countplot(x='rented for', data=data)
count_axes.set_title('Distribution of Categories in "rented for"')
count_axes.set_xlabel('Category')
count_axes.set_ylabel('Count')
plt.show()

In [None]:
# Cast the categorical columns to string type
# NOTE(review): astype(str) turns any remaining NaN into the literal text
# 'nan'; only 'category' is imputed in the cells visible here — confirm that
# this is intended for 'rented for' and 'body type'.
categorical_cols = ['rented for', 'body type', 'category']
data = data.astype({col: str for col in categorical_cols})


In [10]:
# One-hot encode the columns listed in categorical_cols
data_encoded = pd.get_dummies(data=data, columns=categorical_cols)