In [5]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score
)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import seaborn as sns
import numpy as np


In [6]:
project=pd.read_csv("projects.csv")
donation = pd.read_csv("donations.csv")
outcomes=pd.read_csv("outcomes.csv")

# creating timseries data

In [7]:
project_df=project
outcomes_df=outcomes
donation_df=donation
col =["projectid", "total_price_excluding_optional_support","date_posted"]
# only need data from the highest Poverty Level
project_df=project_df[project_df["poverty_level"]=="highest poverty"]
project_df=project_df[col]
project_df = pd.merge(project_df,outcomes_df[["projectid","fully_funded"]], on ="projectid", how ="left")
project_df["fully_funded"]=project_df.fully_funded.fillna('f')
project_df["date_posted"]=pd.to_datetime(project_df["date_posted"])
project_df["week"] = 0
dataframes = [project_df.copy()]
for week_shift in range(1, 5):
    # Duplicate DataFrame
    temp_df = project_df.copy()
    # Shift the date_posted column by month_shift months
    temp_df["date_posted"] = temp_df["date_posted"] + pd.DateOffset(months=week_shift)
    # Set the month column to the current shift
    temp_df["week"] = week_shift
    # Append the shifted DataFrame to the list
    dataframes.append(temp_df)
project_df_combined = pd.concat(dataframes, ignore_index=True)
# donation_df preparation
donation_df["donation_timestamp"]= pd.to_datetime(donation_df["donation_timestamp"].astype(str).str[:10]).dt.date
donation_df=donation_df[["projectid","donation_timestamp","donation_to_project"]]
df = pd.merge(project_df_combined, donation_df, on="projectid", how="left")
df["donation_to_project"]=df["donation_to_project"].fillna(0)
# put all the donation before project posted at the start of the project date.
df.loc[(df['week'] == 0) &
        (df['donation_timestamp'].notna()) & 
        (df['donation_timestamp'] < df['date_posted']), "donation_timestamp"] = df["date_posted"]

intermedia = df[(df.donation_timestamp.isnull()) | ((df.donation_timestamp<= df.date_posted) & (df.donation_timestamp > df.date_posted - pd.DateOffset(months=1)))]
intermedia = intermedia.groupby(["projectid","week"]).sum("donation_to_project").reset_index()[["projectid","week","donation_to_project"]]
project_time_series = pd.merge(project_df_combined, intermedia, on=["projectid","week"], how="left")
project_time_series["donation_to_project"]=project_time_series["donation_to_project"].fillna(0)
project_time_series["donation_sum"]= project_time_series.groupby('projectid')['donation_to_project'].cumsum()
project_time_series["percentage complete"]=project_time_series['donation_sum']/project_time_series['total_price_excluding_optional_support']
project_time_series[(project_time_series["percentage complete"]>=1)&(project_time_series["fully_funded"]=="t")].projectid.nunique()


221418

# creating other features

In [8]:
project_TS_NFF = project_time_series[(project_time_series["percentage complete"]<1)& (project_time_series["week"]<=4)]
col =["projectid","school_longitude","school_latitude", 'teacher_teach_for_america',
      'resource_type', "primary_focus_subject", "primary_focus_area","grade_level",
        "eligible_double_your_impact_match","eligible_almost_home_match","school_charter","school_magnet","school_year_round","school_nlns","school_kipp","school_charter_ready_promise"]
project_attribute=project[col]
binary_col=["teacher_teach_for_america","eligible_double_your_impact_match","eligible_almost_home_match","school_charter","school_magnet","school_year_round","school_nlns","school_kipp","school_charter_ready_promise"]
project_attribute[binary_col] = project_attribute[binary_col].replace({'t': 1, 'f': 0})
project_attribute = pd.get_dummies(project_attribute, columns=['primary_focus_subject', 
                                        'primary_focus_area', 
                                        'resource_type','grade_level'])
project_TS_NFF=pd.merge(project_TS_NFF,project_attribute, on="projectid", how="left")
project_TS_NFF["month"]=project_TS_NFF.date_posted.dt.month
project_TS_NFF["fully_funded"]=project_TS_NFF["fully_funded"].map({"t":0, "f":1})
project_TS_NFF= project_TS_NFF.rename(columns={'fully_funded': 'not_funded'})
# group 
project_TS_NFF["group"]=pd.qcut(project_TS_NFF['date_posted'].rank(method='first'), q=5,labels=False)
project_TS_NFF.groupby("group").agg({"date_posted":["min","max"]})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  project_attribute[binary_col] = project_attribute[binary_col].replace({'t': 1, 'f': 0})


Unnamed: 0_level_0,date_posted,date_posted
Unnamed: 0_level_1,min,max
group,Unnamed: 1_level_2,Unnamed: 2_level_2
0,2002-09-13,2008-12-21
1,2008-12-21,2010-12-02
2,2010-12-02,2012-03-11
3,2012-03-11,2013-09-02
4,2013-09-02,2014-09-12


In [9]:
project_TS_NFF.to_csv("project_time_series.csv")