In [None]:
# Import the data
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing 

# Models to use in our pipeline
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Preprocessing dependencies
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv('data/valentine_gift.csv')
df.head()


Unnamed: 0,Gender,Age,Relationship_Status,Budget,Occasion,Preference,Relationship_Length,Personal_Interest,Past_Gift_Item,Past_Gift_Reaction,Recent_Purchase_1,Recent_Purchase_2,Recent_Purchase_3,Best_Gift
0,Male,56,Engaged,Low,Just Because,Adventurous,<6 months,Music,Tech Gadget,Loved it,Meal Kit Sub,Gym Membership,Magazine Sub,Fashion Accessory
1,Female,58,Single,Low,Anniversary,Surprise,6-12 months,Technology,Chocolates,Loved it,Phone Charger,Smart Speaker,Processor Chip,Fashion Accessory
2,Female,21,Single,Low,Valentine's Day,Romantic,5+ years,Cooking,Fashion Accessory,Loved it,Magazine Sub,Streaming Sub,Meal Kit Sub,Flowers
3,Other,60,Married,Very High,Anniversary,Sentimental,6-12 months,Music,Chocolates,Loved it,Meal Kit Sub,Gym Membership,Magazine Sub,Personalized Gift
4,Male,56,Engaged,Low,Just Because,Adventurous,<6 months,Music,Tech Gadget,Loved it,Meal Kit Sub,Gym Membership,Magazine Sub,Fashion Accessory


In [None]:
# merge two samilar value to one vlaue
def mergeData(df, column_name, value_one, value_two):
    df[column_name] = df[column_name].replace(value_one, value_two)
    
# encode column's value
def hot_encode(df):
       categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
       encoder = OneHotEncoder(sparse_output=False)
       one_hot_encoded = encoder.fit_transform(df[categorical_columns])

       one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))
       df_encoded = pd.concat([df, one_hot_df], axis=1)
       df_encoded = df_encoded.drop(categorical_columns, axis=1)

       return df_encoded
    
def label_encode(df):
# label_encoder object knows  
# how to understand word labels. 
    label_encoder = preprocessing.LabelEncoder() 
    
    # Encode labels in column 'species'. 
    df['Best_Gift_Encoded']= label_encoder.fit_transform(df['Best_Gift']) 
    
    df['Best_Gift_Encoded'].unique() 
    print(df[['Best_Gift', 'Best_Gift_Encoded']].head())

    return df['Best_Gift_Encoded']

def clean_data(df):
    
    # drop useless columns
    df = df.drop(columns = ['Past_Gift_Reaction'])

    # split X and y
    # use hot encoding to encode some columns in x 
    X = df.drop(columns=['Best_Gift'])
    X = hot_encode (X)

    # use OrdinalEncoder to encode "Best_Gift" column and get y
    y = label_encode(df)

    
    return X, y

In [5]:
def r2_adj(X, y, model):
    """
    Calculates adjusted r-squared values

    Args:
    X: Independent variables, the data to fit
    y: dependent variable, the target data to try to predict
    model: The estimator or object to use to train the data

    Returns: adjusted r sqaured value accountign for number of predictors
    """
    r2 = model.score(X, y)
    n = X.shape[0]
    p = y.ndim


def model_generator(X, y):
    y = y.values.reshape(-1, 1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

    models = {
        "RF": RandomForestClassifier(),
        'DT': DecisionTreeClassifier(),
        'SVC': SVC(),
        'AB': AdaBoostClassifier()
    }


    for  name, model in models.items():
        pipeline = Pipeline([
            ("Scale", StandardScaler(with_mean=False)),
            (name, model)
        ])

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        print("name:", name, "score:", pipeline.score(X_test, y_test))
        # print(accuracy_score(y_test, y_pred))

    


In [6]:
X, y = clean_data(df)

           Best_Gift  Best_Gift_Encoded
0  Fashion Accessory                  1
1  Fashion Accessory                  1
2            Flowers                  2
3  Personalized Gift                  3
4  Fashion Accessory                  1


In [7]:
model_generator(X, y)

  return fit_method(estimator, *args, **kwargs)


name: RF score: 0.96
name: DT score: 0.968


  y = column_or_1d(y, warn=True)


name: SVC score: 0.956
name: AB score: 0.564


  y = column_or_1d(y, warn=True)


In [10]:
def getBestGiftForTeamNine():
    ourData = {'Gender': ['Male', 'Male','Male', 'Male', 'Female', 'Female'],
             'Age': [35, 20,38, 23, 32, 38], 
             'Relationship_Status': ['Single', 'Dating','Married', 'Single', 'Married','Married'], 
             'Budget': ['Medium', 'Low','Low', 'Very High', 'Low', 'High'], 
             'Occasion': ['Birthday', 'Just Because', 'Just Because', 'Just Because', 'Birthday','Just Because'], 
             'Preference': ['Practical', 'Practical', 'Romantic','Surprise','Surprise','Surprise'], 
             'Relationship_Length': ['<6 months', '<6 months', '5+ years','<6 months', '5+ years', '5+ years'], 
             'Personal_Interest': ['Technology', 'Music', 'Cooking','Music', 'Sports', 'Fashion'],
             'Past_Gift_Item': ['Chocolates', 'Tech Gadget', 'Flowers','Personalized Gift', 'Tech Gadget','Personalized Gift'],
             'Past_Gift_Reaction': ['Liked it', 'Loved it', 'Disliked it','Liked it', 'Loved it', 'Loved it'],
             'Recent_Purchase_1': ['Headphones', 'Streaming Sub', 'Meal Kit Sub','Phone Charger', 'Hat', 'Handbag'],
             'Recent_Purchase_2': ['Hat', 'Phone Charger', 'Streaming Sub','Hat', 'Smart Speaker', 'Necklace'],
             'Recent_Purchase_3': ['Streaming Sub', 'Headphones', 'Phone Charger','Streaming Sub', 'Headphones', 'Headphones']}
    
    names = ['Leslie', 'Leonard', 'Sophak', 'Gabe', 'Rakesh', 'Yujing']
    gifts = ['Chocolates', 'Fashion Accessory', 'Flowers', 'Personalized gift', 'Subscription Service', 'Tech Gadget']


    df = pd.DataFrame(ourData)
    X_our = hot_encode(df)

    model = DecisionTreeClassifier()

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)
    model.fit(X_train, y_train)


    missing_columns = [col for col in X_train.columns if col not in X_our.columns]
    missing_data = pd.DataFrame(0, index=X_our.index, columns=missing_columns)
    final_df = pd.concat([X_our, missing_data], axis=1)
    final_df = final_df.reindex(columns=X_train.columns)

    y_pred = model.predict(final_df)

    for index, name in enumerate(names):
        gift_num = y_pred[index]
        best_gift = gifts[gift_num]
        print(f'the best gift for {name} is {best_gift}')
    
    

In [11]:
getBestGiftForTeamNine()

the best gift for Leslie is Flowers
the best gift for Leonard is Chocolates
the best gift for Sophak is Tech Gadget
the best gift for Gabe is Subscription Service
the best gift for Rakesh is Fashion Accessory
the best gift for Yujing is Fashion Accessory
