## Forecasting Philanthropy: A Predictive Analysis for Donors Supporting Various School Projects in the USA


#### Step-1: Imports

In [1]:
import numpy as np #numpy for working on arrays
import pandas as pd #pandas for working and creating our own datasets
import datetime as dt #Datetime module supplies classes to work with date and time. These classes provide a number of functions to deal with dates, times and time intervals.
import datetime as dt

#### Step-2: Loading the Dataset

In [2]:
donations = pd.read_csv('../Dataset/Donations.csv')
donors = pd.read_csv('../Dataset/Donors.csv')
projects = pd.read_csv('../Dataset/Projects.csv')
schools = pd.read_csv('../Dataset/Schools.csv')

  donors = pd.read_csv('../Dataset/Donors.csv')


In [3]:
donations_copy = donations.copy()
donors_copy = donors.copy()
projects_copy = projects.copy()
schools_copy = schools.copy()

#### Step-3: Data Cleaning

In [4]:
#Remove the missing values from the dataset

def cleanData(data):
    missing_values = ['$', '%', 'null', 'Null', 'none', 'None', '?', ' ?', '', ' ', None, 'unknown']
    data_null = data.replace(missing_values, pd.NA)

    #rows
    data_missing_values_rows = data_null.isna().sum(axis=1).value_counts()
    
    #columns
    data_missing_values_cols = data_null.isna().sum(axis=0)
    data_missing_values_cols_total = data_missing_values_cols.value_counts()

    return data_null, data_missing_values_rows, data_missing_values_cols, data_missing_values_cols_total

In [5]:
print('Schools')
schools_copy, schools_missing_values_rows, schools_missing_values_cols, schools_missing_values_cols_total = cleanData(schools_copy)
print(f'\nMissing values in rows : \n{schools_missing_values_rows}')
print(f'\nMissing values in each column : \n{schools_missing_values_cols}')
print(f'\nTotal missing values in column : \n{schools_missing_values_cols_total}')

# Remove rows with missing values
schools_clean = schools_copy.dropna()

schools_clean = schools_clean[schools_clean['School State'] == 'Arizona']

# Drop the column that has the highest number of missing values and not required for the project
schools_clean = schools_clean.drop(['School State', 'School County', 'School District', 'School City', 'School Percentage Free Lunch', 'School Zip', 'School Name'], axis=1)

# Keep only unique School IDs
schools_clean = schools_clean.drop_duplicates(subset=['School ID'])

# Removing duplicate rows
schools_clean.drop_duplicates(inplace=True)

print(schools_clean.head())

print('\nSchools Clean Data')
schools_clean, schools_missing_values_rows, schools_missing_values_cols, schools_missing_values_cols_total = cleanData(schools_clean)
print(f'\nMissing values in rows : \n{schools_missing_values_rows}')
print(f'\nMissing values in each column : \n{schools_missing_values_cols}')
print(f'\nTotal missing values in column : \n{schools_missing_values_cols_total}')

Schools

Missing values in rows : 
0    64463
1     7577
2      941
3       12
Name: count, dtype: int64

Missing values in each column : 
School ID                          0
School Name                        0
School Metro Type               8125
School Percentage Free Lunch    1141
School State                       0
School Zip                         0
School City                      227
School County                      2
School District                    0
dtype: int64

Total missing values in column : 
0       5
8125    1
1141    1
227     1
2       1
Name: count, dtype: int64
                            School ID School Metro Type
31   00134665ab6fca630c64ae4e9a6c6d18          suburban
163  0095b620d638e5b176facf472582a717             urban
184  00a464584e724cfc27ce053bd469a214             urban
290  01064bd06f94098b8646e18a8a7230b5             urban
330  0129e949920d23a48f6aaf12bddd86a8             urban

Schools Clean Data

Missing values in rows : 
0    1273
Name: count

In [6]:
print('Projects')
projects_copy, projects_missing_values_rows, projects_missing_values_cols, projects_missing_values_cols_total = cleanData(projects_copy)
print(f'\nMissing values in rows : \n{projects_missing_values_rows}')
print(f'\nMissing values in each column : \n{projects_missing_values_cols}')
print(f'\nTotal missing values in column : \n{projects_missing_values_cols_total}')

# Drop the column that has the highest number of missing values and not required for the project
projects_clean = projects_copy.drop([
        'Project Fully Funded Date', 'Project Short Description', 'Project Essay', 'Project Need Statement', 'Project Title', 'Project Type', 'Project Resource Category',
        'Project Posted Date', 'Project Expiration Date', 'Teacher Project Posted Sequence', 'Teacher ID', 'Project Subject Category Tree'
    ], axis=1)

# Remove rows with missing values
projects_clean = projects_clean.dropna()

# Keep only unique Project ID
projects_clean = projects_clean.drop_duplicates(subset=['Project ID'])

# Extract unique School ID from school_clean
valid_school_ids = schools_clean['School ID'].unique()

# Filter projects_clean to keep only rows with School ID present in valid_school_ids
projects_clean = projects_clean[projects_clean['School ID'].isin(valid_school_ids)]

# Removing duplicate rows
projects_clean.drop_duplicates(inplace=True)

print('\nProjects Clean Data')
projects_clean, projects_missing_values_rows, projects_missing_values_cols, projects_missing_values_cols_total = cleanData(projects_clean)
print(f'\nMissing values in rows : \n{projects_missing_values_rows}')
print(f'\nMissing values in each column : \n{projects_missing_values_cols}')
print(f'\nTotal missing values in column : \n{projects_missing_values_cols_total}')

Projects

Missing values in rows : 
0    826698
1    283259
2        29
3        15
4        14
5         2
Name: count, dtype: int64

Missing values in each column : 
Project ID                               0
School ID                                0
Teacher ID                               0
Teacher Project Posted Sequence          0
Project Type                             0
Project Title                            6
Project Essay                            1
Project Short Description                4
Project Need Statement                   3
Project Subject Category Tree           29
Project Subject Subcategory Tree        29
Project Grade Level Category            53
Project Resource Category               36
Project Cost                             0
Project Posted Date                      0
Project Expiration Date                 14
Project Current Status                   0
Project Fully Funded Date           283253
dtype: int64

Total missing values in column : 
0         

In [7]:
print('Donations')
donations_copy, donations_missing_values_rows, donations_missing_values_cols, donations_missing_values_cols_total = cleanData(donations_copy)
print(f'\nMissing values in rows : \n{donations_missing_values_rows}')
print(f'\nMissing values in each column : \n{donations_missing_values_cols}')
print(f'\nTotal missing values in column : \n{donations_missing_values_cols_total}')

donations_copy = donations_copy.dropna()

# Drop the column that has the highest number of missing values and not required for the project
donations_copy = donations_copy.drop(['Donation Received Date', 'Donor Cart Sequence', 'Donation Included Optional Donation', 'Donation ID'], axis=1)

donations_copy = donations_copy[(donations_copy['Donation Amount'] > 0)]

# Extract unique Project ID from projects_clean
valid_project_ids = projects_clean['Project ID'].unique()

# Filter donations_copy to keep only rows with Project ID present in valid_project_ids
donations_copy = donations_copy[donations_copy['Project ID'].isin(valid_project_ids)]
donations_copy = donations_copy.drop_duplicates(subset=['Project ID'])

donations_copy.drop_duplicates(inplace=True)

print('Donations Clean Data')
donations_clean, donations_missing_values_rows, donations_missing_values_cols, donations_missing_values_cols_total = cleanData(donations_copy)
print(f'\nMissing values in rows : \n{donations_missing_values_rows}')
print(f'\nMissing values in each column : \n{donations_missing_values_cols}')
print(f'\nTotal missing values in column : \n{donations_missing_values_cols_total}')

Donations

Missing values in rows : 
0    4687884
Name: count, dtype: int64

Missing values in each column : 
Project ID                             0
Donation ID                            0
Donor ID                               0
Donation Included Optional Donation    0
Donation Amount                        0
Donor Cart Sequence                    0
Donation Received Date                 0
dtype: int64

Total missing values in column : 
0    7
Name: count, dtype: int64
Donations Clean Data

Missing values in rows : 
0    15590
Name: count, dtype: int64

Missing values in each column : 
Project ID         0
Donor ID           0
Donation Amount    0
dtype: int64

Total missing values in column : 
0    3
Name: count, dtype: int64


In [8]:
print('Donors')
donors_copy, donors_missing_values_rows, donors_missing_values_cols, donors_missing_values_cols_total = cleanData(donors_copy)
print(f'\nMissing values in rows : \n{donors_missing_values_rows}')
print(f'\nMissing values in each column : \n{donors_missing_values_cols}')
print(f'\nTotal missing values in column : \n{donors_missing_values_cols_total}')

# Drop the column that has the highest number of missing values
donors_clean = donors_copy.drop(['Donor City', 'Donor Zip', 'Donor State'], axis=1)

# Remove rows with missing values
donors_clean = donors_clean.dropna()

# Keep only unique Donor ID
donors_clean = donors_clean.drop_duplicates(subset=['Donor ID'])

# Extract unique Donor ID from donations_clean
valid_donor_ids = donations_clean['Donor ID'].unique()

# Filter donors_clean to keep only rows with Donor ID present in valid_donor_ids
donors_clean = donors_clean[donors_clean['Donor ID'].isin(valid_donor_ids)]

# Removing duplicate rows
donors_clean.drop_duplicates(inplace=True)

print('\nDonors Clean Data')
donors_clean, donors_missing_values_rows, donors_missing_values_cols, donors_missing_values_cols_total = cleanData(donors_clean)
print(f'\nMissing values in rows : \n{donors_missing_values_rows}')
print(f'\nMissing values in each column : \n{donors_missing_values_cols}')
print(f'\nTotal missing values in column : \n{donors_missing_values_cols_total}')

Donors

Missing values in rows : 
0    1909543
2     180060
1      33037
Name: count, dtype: int64

Missing values in each column : 
Donor ID                 0
Donor City          213097
Donor State              0
Donor Is Teacher         0
Donor Zip           180060
dtype: int64

Total missing values in column : 
0         3
213097    1
180060    1
Name: count, dtype: int64

Donors Clean Data

Missing values in rows : 
0    11135
Name: count, dtype: int64

Missing values in each column : 
Donor ID            0
Donor Is Teacher    0
dtype: int64

Total missing values in column : 
0    2
Name: count, dtype: int64


#### Step-4: Describing the Dataset

In [9]:
print('Shape of donations dataframe is:' , donations_clean.shape)
print('Shape of donors dataframe is:' , donors_clean.shape)
print('Shape of projects dataframe is:' , projects_clean.shape)
print('Shape of schools dataframe is:' , schools_clean.shape)

Shape of donations dataframe is: (15590, 3)
Shape of donors dataframe is: (11135, 2)
Shape of projects dataframe is: (19733, 6)
Shape of schools dataframe is: (1273, 2)


In [10]:
donations_clean.head()

Unnamed: 0,Project ID,Donor ID,Donation Amount
58,0000c0ea0aecb2ad60e8d234eab6ed28,f2d7c9f3d87bcf44586efba06069f9a3,15.0
1082,000c6d866dda54864643a929fa09017b,cf00d5c1876bb311d6b775906eda03f5,685.6
1219,000eff69eec6f822440ff0033f246ea7,62773cffc74ea1886095f74f8c41d0f3,100.0
1297,000fb8e93ce5c040979437889089e4c0,5171c9331baa47662ce4c96420d56b08,25.0
1351,00108305261d3e98754963b3af27f7be,2ee9c155bdd6592bb609af96bddc84bc,100.0


In [11]:
donations_clean.describe()

Unnamed: 0,Donation Amount
count,15590.0
mean,97.771584
std,304.531711
min,0.2
25%,25.0
50%,42.59
75%,100.0
max,14198.66


In [12]:
donors_clean.head()

Unnamed: 0,Donor ID,Donor Is Teacher
576,0011533f40444b613f6d3c028b0566d5,No
599,00120f847582d9da16787814c532d0e6,No
633,00131aacd0da4a48e31ae48075e81e13,No
647,0013b0b9fcb2f470123907dabd929254,Yes
1026,0020047941c1e294d7f5389b21e5ca95,No


In [13]:
donors_clean.describe()

Unnamed: 0,Donor ID,Donor Is Teacher
count,11135,11135
unique,11135,2
top,0011533f40444b613f6d3c028b0566d5,No
freq,1,8874


In [14]:
projects_clean.head()

Unnamed: 0,Project ID,School ID,Project Subject Subcategory Tree,Project Grade Level Category,Project Cost,Project Current Status
117,e395f090c8d78bb74fceda0be2bda323,4ad625f945b8081d0cf2070f83a4b14b,"Literature & Writing, Mathematics",Grades PreK-2,165.46,Fully Funded
262,49aa9ef05619a8381dbd92474b5396ca,4a26c1787697404f26581162c48bd740,"Literacy, Literature & Writing",Grades PreK-2,192.96,Fully Funded
282,d7ccc822deff43d11fc5890e48c4d0e4,1b2759ca4743b42447723adbaf8b0326,"Applied Sciences, Visual Arts",Grades 6-8,577.61,Fully Funded
289,b68a6f552c9024f5e4e34dc5b9363d1d,a83ca53b3b88d622a8091b5c65bf6c0e,Mathematics,Grades 9-12,8681.69,Expired
293,f9b7ea03b4a0ff3da4e62cb9f9f592e5,e7532772adcf4e033df92f8dc48933a8,Mathematics,Grades 3-5,416.79,Expired


In [15]:
projects_clean.describe()

Unnamed: 0,Project Cost
count,19733.0
mean,776.183573
std,893.300152
min,154.71
25%,328.64
50%,522.6
75%,886.79
max,30055.41


In [16]:
schools_clean.head()

Unnamed: 0,School ID,School Metro Type
31,00134665ab6fca630c64ae4e9a6c6d18,suburban
163,0095b620d638e5b176facf472582a717,urban
184,00a464584e724cfc27ce053bd469a214,urban
290,01064bd06f94098b8646e18a8a7230b5,urban
330,0129e949920d23a48f6aaf12bddd86a8,urban


In [17]:
schools_clean.describe()

Unnamed: 0,School ID,School Metro Type
count,1273,1273
unique,1273,4
top,00134665ab6fca630c64ae4e9a6c6d18,urban
freq,1,715


#### Step-5: Creating New Dataframe

In [18]:
data_temp_1 = pd.merge(donations_clean , projects_clean , how='inner' , on = 'Project ID')

In [19]:
data_temp_2 = pd.merge(data_temp_1 , donors_clean , how='inner' , on='Donor ID')

In [20]:
data = pd.merge(data_temp_2 , schools_clean , how='inner' , on='School ID')

In [21]:
data.shape

(15563, 10)

In [22]:
data_columns = data.columns.values.tolist()
data_columns

['Project ID',
 'Donor ID',
 'Donation Amount',
 'School ID',
 'Project Subject Subcategory Tree',
 'Project Grade Level Category',
 'Project Cost',
 'Project Current Status',
 'Donor Is Teacher',
 'School Metro Type']

#### Step-6: Data Transformation

- Nominal: 
    - Project ID
    - Donor ID
    - School ID
    - Project Subject Subcategory Tree
    - Donor Is Teacher

- Ordinal:
    - Project Grade Level Category
    - Project Current Status
    - School Metro Type

- Interval: No interval attributes

- Ratio:
    - Donation Amount
    - Project Cost

##### Step - 6.1: Transform Ratio Attributes - Create a new data frame and initialize it with the Ratio attribute

In [23]:
data_transform = data[[
    'Donation Amount', 'Project Cost'
]]
data_transform.head()

Unnamed: 0,Donation Amount,Project Cost
0,15.0,326.52
1,3.56,404.19
2,25.0,1082.62
3,20.0,355.32
4,25.0,296.82


##### Step - 6.2: Transform Categorical Attributes - Create dummies(0,1) and Concat to new data frame

In [24]:
categories = [
    'Project ID', 'Donor ID', 'School ID',
    'Project Subject Subcategory Tree', 'Donor Is Teacher'
]

for category in categories:
    dummy = pd.get_dummies(data[category], dtype=int, sparse=True)
    # Concat both dataframes
    data_transform = pd.concat([data_transform, dummy], axis=1)

In [25]:
data_transform.head()

Unnamed: 0,Donation Amount,Project Cost,0000c0ea0aecb2ad60e8d234eab6ed28,000c6d866dda54864643a929fa09017b,000eff69eec6f822440ff0033f246ea7,000fb8e93ce5c040979437889089e4c0,00108305261d3e98754963b3af27f7be,0012d94ac914624f70e45fb22206e47e,001eef45c3844900ca873b48eabadc7e,001fb874f899197daffeaa6a7335fac1,...,Special Needs,"Special Needs, Team Sports","Special Needs, Visual Arts","Special Needs, Warmth, Care & Hunger",Team Sports,"Team Sports, Visual Arts",Visual Arts,"Warmth, Care & Hunger",No,Yes
0,15.0,326.52,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,3.56,404.19,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,25.0,1082.62,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,20.0,355.32,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,25.0,296.82,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


##### Step - 6.3: Transform Ordinal Attributes - Get unique items, create dictionary, and map to dataframe

In [26]:
# Project Grade Level Category

# Get unique items
project_grade_lvl_cat = data['Project Grade Level Category'].unique().tolist()
print(f"Unique values of 'Project Grade Level Category' Column: \n{project_grade_lvl_cat}")

Unique values of 'Project Grade Level Category' Column: 
['Grades 3-5', 'Grades PreK-2', 'Grades 6-8', 'Grades 9-12']


In [27]:
# Create dictionary for this column
project_grade_lvl_cat_dict = {
    'Grades PreK-2': 1,
    'Grades 3-5': 2,
    'Grades 6-8': 3,
    'Grades 9-12': 4
}

# Create new column in new dataframe and add this column
data_transform['Project Grade Level Category'] = data['Project Grade Level Category'].map(project_grade_lvl_cat_dict)
data_transform.head()

Unnamed: 0,Donation Amount,Project Cost,0000c0ea0aecb2ad60e8d234eab6ed28,000c6d866dda54864643a929fa09017b,000eff69eec6f822440ff0033f246ea7,000fb8e93ce5c040979437889089e4c0,00108305261d3e98754963b3af27f7be,0012d94ac914624f70e45fb22206e47e,001eef45c3844900ca873b48eabadc7e,001fb874f899197daffeaa6a7335fac1,...,"Special Needs, Team Sports","Special Needs, Visual Arts","Special Needs, Warmth, Care & Hunger",Team Sports,"Team Sports, Visual Arts",Visual Arts,"Warmth, Care & Hunger",No,Yes,Project Grade Level Category
0,15.0,326.52,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,2
1,3.56,404.19,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
2,25.0,1082.62,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,2
3,20.0,355.32,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
4,25.0,296.82,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


In [28]:
# Project Current Status

# Get unique items
project_curr_stat_cat = data['Project Current Status'].unique().tolist()
print(f"Unique values of 'Project Current Status' Column: \n{project_curr_stat_cat}")

Unique values of 'Project Current Status' Column: 
['Live', 'Fully Funded', 'Expired']


In [29]:
# Create dictionary for this column
project_curr_stat_cat_dict = {
    'Expired': 1,
    'Live': 2,
    'Fully Funded': 3
}

# Create new column in new dataframe and add this column
data_transform['Project Current Status'] = data['Project Current Status'].map(project_curr_stat_cat_dict)
data_transform.head()

Unnamed: 0,Donation Amount,Project Cost,0000c0ea0aecb2ad60e8d234eab6ed28,000c6d866dda54864643a929fa09017b,000eff69eec6f822440ff0033f246ea7,000fb8e93ce5c040979437889089e4c0,00108305261d3e98754963b3af27f7be,0012d94ac914624f70e45fb22206e47e,001eef45c3844900ca873b48eabadc7e,001fb874f899197daffeaa6a7335fac1,...,"Special Needs, Visual Arts","Special Needs, Warmth, Care & Hunger",Team Sports,"Team Sports, Visual Arts",Visual Arts,"Warmth, Care & Hunger",No,Yes,Project Grade Level Category,Project Current Status
0,15.0,326.52,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,2,2
1,3.56,404.19,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,3
2,25.0,1082.62,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,2,1
3,20.0,355.32,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,3
4,25.0,296.82,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,3


In [30]:
# School Metro Type

# Get unique items
school_metro_type_cat = data['School Metro Type'].unique().tolist()
print(f"Unique values of 'School Metro Type' Column: \n{school_metro_type_cat}")

Unique values of 'School Metro Type' Column: 
['urban', 'suburban', 'town', 'rural']


In [31]:
# Create dictionary for this column
school_metro_type_cat_dict = {
    'rural': 1,
    'town': 2,
    'suburban': 3,
    'urban': 4
}

# Create new column in new dataframe and add this column
data_transform['School Metro Type'] = data['School Metro Type'].map(school_metro_type_cat_dict)
data_transform.head()

Unnamed: 0,Donation Amount,Project Cost,0000c0ea0aecb2ad60e8d234eab6ed28,000c6d866dda54864643a929fa09017b,000eff69eec6f822440ff0033f246ea7,000fb8e93ce5c040979437889089e4c0,00108305261d3e98754963b3af27f7be,0012d94ac914624f70e45fb22206e47e,001eef45c3844900ca873b48eabadc7e,001fb874f899197daffeaa6a7335fac1,...,"Special Needs, Warmth, Care & Hunger",Team Sports,"Team Sports, Visual Arts",Visual Arts,"Warmth, Care & Hunger",No,Yes,Project Grade Level Category,Project Current Status,School Metro Type
0,15.0,326.52,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,2,2,4
1,3.56,404.19,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,3,4
2,25.0,1082.62,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,2,1,4
3,20.0,355.32,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,3,4
4,25.0,296.82,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,3,4


##### Step - 6.4: Transform Interval Attributes - Find minimum/earliest date, subtract from each values and update it as days

No interval attributes

In [32]:
# def intervalAttrTransform(col, data_transform, data):
#     minimum_datetime = pd.to_datetime(data[col]).min()
#     data[col] = pd.to_datetime(data[col]) - minimum_datetime
#     data_transform[col] = data[col].dt.days
#     return data_transform

# interval_cols = ['Column-1', 'Column-2']

# for col in interval_cols:
#     data_transform = intervalAttrTransform(col, data_transform, data)

# data_transform.head()

In [33]:
data_transform.shape

(15563, 28257)

In [34]:
# Transpose the DataFrame to make columns as rows
df_transposed = data_transform.T

# Drop duplicate columns (i.e., rows in the transposed DataFrame)
df_transposed = df_transposed.drop_duplicates()

# Transpose the DataFrame back to original form
df_unique_columns = df_transposed.T

data_transform = df_unique_columns.copy()

# data_transform.to_csv('../Dataset/Transformed_Data.csv', index=False)

data_transform.shape

(15563, 18572)

#### Step-7: Data Mining Techniques and Machine Learning Models

### Model 1: Decision Tree Regressor with SelectKBest

In [48]:
# Importing necessary libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import r2_score
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.tree import DecisionTreeRegressor

transformedData = data_transform.copy()

# Splitting the dataset into features (X) and target variable (y)
X = transformedData.drop(columns=['Donation Amount'])
y = transformedData['Donation Amount']

# Convert column names to strings
X.columns = X.columns.astype(str)

# Select the top 1000 features using SelectKBest
selector = SelectKBest(score_func=f_regression, k=5000)
X_new = selector.fit_transform(X, y)

# First, split the dataset into training (80%) and temporary (20%)
X_train, X_temp, y_train, y_temp = train_test_split(X_new, y, test_size=0.2, random_state=42)

# Then split the temporary set into separate validation (10%) and testing (10%) sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 0.5 x 0.2 = 0.1

# Define a smaller set of hyperparameters to search
param_dist = {
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Initialize Decision Tree Regression model
model = DecisionTreeRegressor(random_state=42)

# Initialize RandomizedSearchCV with reduced iterations
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=3, cv=5, random_state=42, n_jobs=-1)

# Fit the model with the best hyperparameters
random_search.fit(X_train, y_train)

# Get the best model
best_model = random_search.best_estimator_

# Perform k-fold cross-validation on the training set
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5)

# Print the cross-validation scores
print("Cross-validation scores: ", cv_scores)

# Make predictions on the validation data
y_val_pred = best_model.predict(X_val)

# Calculating the R-squared score (accuracy) on validation data
accuracy_val = r2_score(y_val, y_val_pred)

# Convert accuracy to percentage
accuracy_val_percentage = accuracy_val * 100

# Printing the validation accuracy in percentage
print("Validation Accuracy (R-squared Score): {:.2f}%".format(accuracy_val_percentage))

# Make predictions on the testing data
y_test_pred = best_model.predict(X_test)

# Calculating the R-squared score (accuracy) on testing data
accuracy_test = r2_score(y_test, y_test_pred)

# Convert accuracy to percentage
accuracy_test_percentage = accuracy_test * 100

# Printing the testing accuracy in percentage
print("Testing Accuracy (R-squared Score): {:.2f}%".format(accuracy_test_percentage))


Cross-validation scores:  [0.02213072 0.53351277 0.06317423 0.19296904 0.40127182]
Validation Accuracy (R-squared Score): 64.24%
Testing Accuracy (R-squared Score): 49.37%


### Model 2: Decision Tree Regressor

In [35]:
# Importing necessary libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

transformedData = data_transform.copy()

# Splitting the dataset into features (X) and target variable (y)
y = transformedData['Donation Amount']  # Target variable
X = transformedData.drop(columns=['Donation Amount'])  # Features

# Convert column names to strings
X.columns = X.columns.astype(str)

# First, split the dataset into training (80%) and temporary (20%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Then split the temporary set into separate validation (10%) and testing (10%) sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 0.5 x 0.2 = 0.1

# Initializing the Decision Tree Regression model with parallel processing
model = DecisionTreeRegressor(random_state=42)

# Perform k-fold cross-validation on the training set
cv_scores = cross_val_score(model, X_train, y_train, cv=3)

# Print the cross-validation scores
print("Cross-validation scores: ", cv_scores)

# Fitting the model on the training data
model.fit(X_train, y_train)

# Making predictions on the validation data
y_val_pred = model.predict(X_val)

# Calculating the R-squared score (accuracy) on validation data
accuracy_val = r2_score(y_val, y_val_pred)

# Convert accuracy to percentage
accuracy_val_percentage = accuracy_val * 100

# Printing the validation accuracy in percentage
print("Validation Accuracy (R-squared Score): {:.2f}%".format(accuracy_val_percentage))

# Making predictions on the testing data
y_test_pred = model.predict(X_test)

# Calculating the R-squared score (accuracy) on testing data
accuracy_test = r2_score(y_test, y_test_pred)

# Convert accuracy to percentage
accuracy_test_percentage = accuracy_test * 100

# Printing the testing accuracy in percentage
print("Testing Accuracy (R-squared Score): {:.2f}%".format(accuracy_test_percentage))

Cross-validation scores:  [-0.36657661  0.35926843  0.06874284]
Validation Accuracy (R-squared Score): 52.97%
Testing Accuracy (R-squared Score): 50.14%


### Model 3: Random Forest Regressor with SelectKBest

In [40]:
# Importing necessary libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import r2_score
from sklearn.feature_selection import SelectKBest, f_regression

transformedData = data_transform.copy()

# Splitting the dataset into features (X) and target variable (y)
X = transformedData.drop(columns=['Donation Amount'])
y = transformedData['Donation Amount']

# Convert column names to strings
X.columns = X.columns.astype(str)

# Select the 10 best features
selector = SelectKBest(score_func=f_regression, k=1500)
X_new = selector.fit_transform(X, y)

# First, split the dataset into training (80%) and temporary (20%)
X_train, X_temp, y_train, y_temp = train_test_split(X_new, y, test_size=0.2, random_state=42)

# Then split the temporary set into separate validation (10%) and testing (10%) sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 0.5 x 0.2 = 0.1

# Initialize Random Forest Regression model
model = RandomForestRegressor(random_state=42)

# Define a smaller set of hyperparameters to search
param_dist = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=5, cv=3, random_state=42, n_jobs=-1)

# Fit the model with the best hyperparameters
random_search.fit(X_train, y_train)

# Get the best model
best_model = random_search.best_estimator_

# Perform k-fold cross-validation on the training set
cv_scores = cross_val_score(best_model, X_train, y_train, cv=3)

# Print the cross-validation scores
print("Cross-validation scores: ", cv_scores)

# Print the average cross-validation score
print("Average cross-validation score: {:.2f}".format(cv_scores.mean()))

# Make predictions on the validation data
y_val_pred = best_model.predict(X_val)

# Calculating the R-squared score (accuracy) on validation data
accuracy_val = r2_score(y_val, y_val_pred)

# Convert accuracy to percentage
accuracy_val_percentage = accuracy_val * 100

# Printing the validation accuracy in percentage
print("Validation Accuracy (R-squared Score): {:.2f}%".format(accuracy_val_percentage))

# Make predictions on the testing data
y_test_pred = best_model.predict(X_test)

# Calculating the R-squared score (accuracy) on testing data
accuracy_test = r2_score(y_test, y_test_pred)

# Convert accuracy to percentage
accuracy_test_percentage = accuracy_test * 100

# Printing the testing accuracy in percentage
print("Testing Accuracy (R-squared Score): {:.2f}%".format(accuracy_test_percentage))


Cross-validation scores:  [0.17835083 0.50276431 0.49208495]
Average cross-validation score: 0.39
Validation Accuracy (R-squared Score): 64.49%
Testing Accuracy (R-squared Score): 51.22%
