In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('data/kickstarter_projects.csv')

# Convert column names to lowercase
df.columns = df.columns.str.lower()

# Drop variables 'id', 'name', and rows where 'state' is 'suspended' or 'goal' is 0
dropped_df = df.drop(columns=['id', 'name'])
dropped_df = dropped_df.drop(df[df.state == 'Live'].index)
dropped_df = dropped_df.drop(df[df.state == 'Suspended'].index)
dropped_df = dropped_df.drop(df[df.goal == 0].index)

# Replace 'Canceled' with 'Failed'
dropped_df.state.replace({'Canceled': 'Failed'}, inplace = True)

# Convert 'launched' and 'deadline' columns to datetime
dropped_df['launched'] = pd.to_datetime(dropped_df['launched'])
dropped_df['deadline'] = pd.to_datetime(dropped_df['deadline'])

# Calculate the duration in days
dropped_df['duration'] = (dropped_df['deadline'] - dropped_df['launched']).dt.days

# Create a new column 'launch_month' with the month extracted from the 'launched' column
dropped_df['launch_month'] = dropped_df['launched'].dt.month

df = dropped_df

df.head()

Unnamed: 0,category,subcategory,country,launched,deadline,goal,pledged,backers,state,duration,launch_month
0,Fashion,Fashion,United States,2009-04-21 21:02:48,2009-05-31,1000,625,30,Failed,39,4
1,Film & Video,Shorts,United States,2009-04-23 00:07:53,2009-07-20,80000,22,3,Failed,87,4
2,Art,Illustration,United States,2009-04-24 21:52:03,2009-05-03,20,35,3,Successful,8,4
3,Technology,Software,United States,2009-04-25 17:36:21,2009-07-14,99,145,25,Successful,79,4
4,Fashion,Fashion,United States,2009-04-27 14:10:39,2009-05-26,1900,387,10,Failed,28,4


In [2]:
df.state.value_counts()

state
Failed        236360
Successful    133849
Name: count, dtype: int64

In [3]:
from sklearn.preprocessing import LabelEncoder

# Label encoding
columns_to_encode = df[['category', 'subcategory', 'country']]
# Initialize LabelEncoder
label_encoder = LabelEncoder()
# Apply LabelEncoder to each categorical column
for col in columns_to_encode:
    df[col + '_encoded'] = label_encoder.fit_transform(df[col])

# Create a DataFrame to store unique original values and their corresponding encoded values for each column
encoded_values_dfs = {}
# Iterate through the columns to be encoded
for col in columns_to_encode:
    # Get unique values of the original column and their corresponding encoded values
    unique_original_values = df[col].unique()
    encoded_values = label_encoder.fit_transform(unique_original_values)
    # Create a DataFrame with original and encoded values
    encoded_values_df = pd.DataFrame({
        f'{col}_Original': unique_original_values,
        f'{col}_Encoded': encoded_values
    })
    # Sort the DataFrame by the encoded values (for 'Category' column)
    if col == 'Category':
        encoded_values_df = encoded_values_df.sort_values(by=f'{col}_Encoded', ascending=True)
    # Store the DataFrame in the dictionary
    encoded_values_dfs[col] = encoded_values_df
# Display the DataFrames
for col, encoded_values_df in encoded_values_dfs.items():
    print(f'\n{col}:\n')
    print(encoded_values_df)


category:

   category_Original  category_Encoded
0            Fashion                 5
1       Film & Video                 6
2                Art                 0
3         Technology                13
4         Journalism                 9
5         Publishing                12
6            Theater                14
7              Music                10
8        Photography                11
9              Games                 8
10            Design                 4
11              Food                 7
12            Crafts                 2
13            Comics                 1
14             Dance                 3

subcategory:

    subcategory_Original  subcategory_Encoded
0                Fashion                   52
1                 Shorts                  129
2           Illustration                   70
3               Software                  131
4             Journalism                   77
..                   ...                  ...
154        Playing Cards   

In [4]:
encoded_values_dfs

{'category':    category_Original  category_Encoded
 0            Fashion                 5
 1       Film & Video                 6
 2                Art                 0
 3         Technology                13
 4         Journalism                 9
 5         Publishing                12
 6            Theater                14
 7              Music                10
 8        Photography                11
 9              Games                 8
 10            Design                 4
 11              Food                 7
 12            Crafts                 2
 13            Comics                 1
 14             Dance                 3,
 'subcategory':     subcategory_Original  subcategory_Encoded
 0                Fashion                   52
 1                 Shorts                  129
 2           Illustration                   70
 3               Software                  131
 4             Journalism                   77
 ..                   ...                  ...
 15

In [5]:
df.head()

Unnamed: 0,category,subcategory,country,launched,deadline,goal,pledged,backers,state,duration,launch_month,category_encoded,subcategory_encoded,country_encoded
0,Fashion,Fashion,United States,2009-04-21 21:02:48,2009-05-31,1000,625,30,Failed,39,4,5,52,21
1,Film & Video,Shorts,United States,2009-04-23 00:07:53,2009-07-20,80000,22,3,Failed,87,4,6,129,21
2,Art,Illustration,United States,2009-04-24 21:52:03,2009-05-03,20,35,3,Successful,8,4,0,70,21
3,Technology,Software,United States,2009-04-25 17:36:21,2009-07-14,99,145,25,Successful,79,4,13,131,21
4,Fashion,Fashion,United States,2009-04-27 14:10:39,2009-05-26,1900,387,10,Failed,28,4,5,52,21


In [6]:
# Save new dataset
df.to_csv('data/cleaned_encoded_kickstarter_projects.csv', index=False)