In [None]:
import pandas as pd

# Load the raw Kickstarter dataset.
df = pd.read_csv('data/kickstarter_projects.csv')

# Convert column names to lowercase so the rest of the notebook can refer to
# them with a single spelling.
df.columns = df.columns.str.lower()

# Rows to keep: every project whose 'state' is neither 'Live' nor 'Suspended'
# and whose 'goal' is not 0.
rows_to_keep = ~df['state'].isin(['Live', 'Suspended']) & (df['goal'] != 0)

dropped_df = (
    df.loc[rows_to_keep]
    # The 'id' and 'name' columns are removed as well.
    .drop(columns=['id', 'name'])
    .assign(
        # Replace 'Canceled' with 'Failed'. The result is assigned back to the
        # column instead of calling `dropped_df.state.replace(..., inplace=True)`:
        # that form is chained assignment through an in-place method, which
        # pandas warns about and which leaves the frame unchanged when
        # Copy-on-Write is enabled.
        state=lambda frame: frame['state'].replace({'Canceled': 'Failed'}),
        # Convert 'launched' and 'deadline' columns to datetime.
        launched=lambda frame: pd.to_datetime(frame['launched']),
        deadline=lambda frame: pd.to_datetime(frame['deadline']),
    )
    .assign(
        # Whole days between launch and deadline.
        duration=lambda frame: (frame['deadline'] - frame['launched']).dt.days,
        # Month number extracted from the 'launched' timestamp.
        launch_month=lambda frame: frame['launched'].dt.month,
    )
)

# From here on `df` refers to the cleaned frame (same object as `dropped_df`).
df = dropped_df

df.head()

In [None]:
# Number of projects per outcome in the 'state' column.
df['state'].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label encoding: for each categorical column listed here, add an
# integer-coded '<column>_encoded' column to `df`.
columns_to_encode = ['category', 'subcategory', 'country']

# One fitted encoder per column, kept so each encoding can be looked up or
# inverted later (a single shared encoder only remembers its last fit).
label_encoders = {}

# Per-column lookup tables: original value -> encoded value.
encoded_values_dfs = {}

for col in columns_to_encode:
    # Fit a fresh encoder on this column and store the integer codes.
    label_encoder = LabelEncoder()
    df[col + '_encoded'] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

    # Build the lookup table from the encoder that produced the column, so the
    # table is guaranteed to match the stored codes. `classes_` holds the
    # fitted labels in code order, so every table comes out sorted by encoded
    # value (the previous `col == 'Category'` check never matched because the
    # column names are lowercase, so no table was ever sorted).
    original_values = label_encoder.classes_
    encoded_values_dfs[col] = pd.DataFrame({
        f'{col}_Original': original_values,
        f'{col}_Encoded': label_encoder.transform(original_values),
    })

# Display the lookup tables
for col, encoded_values_df in encoded_values_dfs.items():
    print(f'\n{col}:\n')
    print(encoded_values_df)

In [None]:
# Show the dictionary of lookup tables (keyed by column name) that the
# encoding cell stored in `encoded_values_dfs`.
encoded_values_dfs

In [None]:
# Preview the first five rows of the frame, including the '*_encoded' columns.
df.head(n=5)

In [None]:
# Write the cleaned and encoded frame to disk, leaving out the row index.
output_path = 'data/cleaned_encoded_kickstarter_projects.csv'
df.to_csv(output_path, index=False)