In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('data/cleaned_encoded_kickstarter_projects.csv')
df.head()

Unnamed: 0,category,subcategory,country,launched,deadline,goal,pledged,backers,state,duration,launch_month,category_encoded,subcategory_encoded,country_encoded
0,Fashion,Fashion,United States,2009-04-21 21:02:48,2009-05-31,1000,625,30,Failed,39,4,5,52,21
1,Film & Video,Shorts,United States,2009-04-23 00:07:53,2009-07-20,80000,22,3,Failed,87,4,6,129,21
2,Art,Illustration,United States,2009-04-24 21:52:03,2009-05-03,20,35,3,Successful,8,4,0,70,21
3,Technology,Software,United States,2009-04-25 17:36:21,2009-07-14,99,145,25,Successful,79,4,13,131,21
4,Fashion,Fashion,United States,2009-04-27 14:10:39,2009-05-26,1900,387,10,Failed,28,4,5,52,21


In [None]:
df.columns

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370209 entries, 0 to 370208
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   category             370209 non-null  object
 1   subcategory          370209 non-null  object
 2   country              370209 non-null  object
 3   launched             370209 non-null  object
 4   deadline             370209 non-null  object
 5   goal                 370209 non-null  int64 
 6   pledged              370209 non-null  int64 
 7   backers              370209 non-null  int64 
 8   state                370209 non-null  object
 9   duration             370209 non-null  int64 
 10  launch_month         370209 non-null  int64 
 11  category_encoded     370209 non-null  int64 
 12  subcategory_encoded  370209 non-null  int64 
 13  country_encoded      370209 non-null  int64 
dtypes: int64(8), object(6)
memory usage: 39.5+ MB


In [7]:
print(df.describe())

               goal       pledged        backers       duration   
count  3.702090e+05  3.702090e+05  370209.000000  370209.000000  \
mean   4.509281e+04  9.144729e+03     106.989822      33.168994   
std    1.128632e+06  9.158248e+04     915.277662      12.784716   
min    1.000000e+00  0.000000e+00       0.000000       0.000000   
25%    2.000000e+03  3.200000e+01       2.000000      29.000000   
50%    5.500000e+03  6.330000e+02      12.000000      29.000000   
75%    1.598700e+04  4.076000e+03      57.000000      36.000000   
max    1.663614e+08  2.033899e+07  219382.000000      91.000000   

        launch_month  category_encoded  subcategory_encoded  country_encoded  
count  370209.000000     370209.000000        370209.000000    370209.000000  
mean        6.428031          7.492746            80.738124        19.040207  
std         3.307246          3.914019            45.278376         5.257678  
min         1.000000          0.000000             0.000000         0.000000  
2

In [3]:
# Define feature and target variable
X = df[['goal', 'duration', 'launch_month', 'category_encoded', 'subcategory_encoded', 'country_encoded']]
y = df['pledged']

X.shape

(370209, 6)

In [4]:
# Apply logarithmic transformation to 'goal' and 'pledged' columns
X['goal'] = np.log1p(X['goal'])  # log1p is used to handle zero values
y = np.log1p(y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['goal'] = np.log1p(X['goal'])  # log1p is used to handle zero values


In [28]:
y.describe()

count    370209.000000
mean          5.771424
std           3.310949
min           0.000000
25%           3.496508
50%           6.452049
75%           8.313117
max          16.828050
Name: pledged, dtype: float64

In [None]:
# Fit linear regression model
lin_reg = LinearRegression()
lin_reg.fit(X, y)

# Calculate intercept and coefficient
intercept = lin_reg.intercept_
coefficients = lin_reg.coef_
print("Intercept:", intercept.round(5))
print("Coefficients:", coefficients.round(5))

In [None]:
df[['goal', 'duration', 'launch_month', 'category_encoded', 'subcategory_encoded', 'country_encoded']].corr()

In [None]:
# Calculate r-squared 
y_hat = lin_reg.predict(X)
print("R-squared:", r2_score(y, y_hat).round(3))

In [20]:
from sklearn.preprocessing import PolynomialFeatures

# Create PolynomialFeatures object with interaction_only=True
poly = PolynomialFeatures(degree=3, interaction_only=True, include_bias=False)

# Fit and transform the data
interaction_features = poly.fit_transform(X)

# Create a new df with interaction features
feature_names = poly.get_feature_names_out(input_features=X.columns)
interaction_df = pd.DataFrame(interaction_features, columns=feature_names)

In [21]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(interaction_df, y, test_size=0.25, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
# Train the linear regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

In [23]:
# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Inverse transform the predictions to the original scale
y_pred_original_scale = np.expm1(y_pred)  # Inverse of log1p is expm1
y_test_original_scale = np.expm1(y_test)

# Evaluating the model with original target values
mse_original_scale = mean_squared_error(y_test_original_scale, y_pred_original_scale)
print(f'Mean Squared Error (Original Scale): {mse_original_scale}')

Mean Squared Error: 10.565134592148606
Mean Squared Error (Original Scale): 7837402788.903474
