# Modeling notes

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn import metrics

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import MultiOutputClassifier

# suppress all warnings (this silences every warning category, not only minor ones)
import warnings
warnings.filterwarnings("ignore")

import wrangle
import explore
import modelling

# Random seed constant. Not referenced in the cells visible here;
# presumably intended for reproducible splits/models -- TODO confirm usage.
RAND_SEED = 357

In [2]:
# df = wrangle.make_pet_dataframe()
# train, validate, test = wrangle.split_data(df)

In [3]:
def make_feature_column(df):
    '''Add a `length_of_stay` column holding whole days from intake to outcome.

    The column is written onto the frame that was passed in (the input is
    modified), and that same frame object is returned.
    '''
    stay_duration = df.outcome_date - df.intake_date
    df['length_of_stay'] = stay_duration.dt.days
    return df

#make_feature_column(train)


In [4]:
def drop_columns_for_model(df):
    '''Return a copy of `df` without the columns that are excluded from modeling.

    The input frame is left untouched. A KeyError is raised by pandas if any
    of the listed columns is missing.
    '''
    excluded_columns = [
        'outcome_date',
        'name',
        'sex_upon_outcome',
        'sex_upon_intake',
        'outcome_type',
        'outcome_subtype',
        'found_location',
    ]
    return df.drop(columns=excluded_columns)

#drop_columns_for_model(train)

In [5]:
def make_X_and_y(df):
    '''Split a frame into a feature set (X) and a target set (y).

    Returns a tuple `(X_df, y_df)`:
      * X_df -- every column of `df` except `target_outcome`
      * y_df -- only the `animal_id` and `target_outcome` columns
    '''
    # features: everything but the target
    features = df.drop(columns='target_outcome')
    # target, kept together with the animal id
    target_columns = ['animal_id', 'target_outcome']
    target = df[target_columns]
    return features, target

#X_train, y_train = make_X_and_y(train)

In [6]:
def make_dummies(df):
    '''Append dummy-encoded versions of the categorical (object) columns.

    Every object-dtype column except the first one is encoded (the first is
    skipped -- presumably an identifier column; verify against the data).
    The first level of each encoded column is dropped and missing values get
    no dummy column of their own. The original categorical columns are kept;
    the dummy columns are added to the right of them in a new frame.
    '''
    # names of the object columns, leaving out the first one
    categorical_names = df.select_dtypes('object').columns[1:].tolist()
    # one-hot encode just those columns
    encoded = pd.get_dummies(df[categorical_names], dummy_na=False, drop_first=True)
    return pd.concat([df, encoded], axis=1)

# train_dummy = make_dummies(train)

In [7]:
# Load the pet data and split it into train / validate / test sets.
df = wrangle.make_pet_dataframe()
train, validate, test = wrangle.split_data(df)
compare_models = []

# Baseline model: always predict the most common target_outcome seen in train.
baseline_prediction = train.target_outcome.value_counts().idxmax()
# One copy of that prediction per training row (a Series, not a DataFrame).
baseline_model = pd.Series([baseline_prediction] * train.shape[0])

# Produce a classification report and keep only its overall accuracy.
baseline_report = metrics.classification_report(
    train.target_outcome,
    baseline_model,
    zero_division=True,
    output_dict=True,
)
baseline_accuracy = baseline_report['accuracy']

Returning saved csv files.


In [8]:
# Display the baseline accuracy (share of train rows matching the most common outcome).
baseline_accuracy

0.4347103811434303

In [9]:
# Run the project's modelling.models_mass on the train and validate sets, passing
# the baseline accuracy. The result is used below as a DataFrame with
# train_accuracy and validate_accuracy columns.
model_df = modelling.models_mass(train, validate, baseline_accuracy)

Finished Logistic Regression with solver sagasn-cgs 45050

In [10]:
# Gap between train and validate accuracy; a larger positive gap suggests overfitting.
model_df['difference'] = model_df['train_accuracy'].sub(model_df['validate_accuracy'])

In [11]:
# Rank models by validate accuracy, then by train/validate gap (both descending).
# NOTE(review): descending `difference` lists the larger gap first among ties -- confirm intent.
model_df.sort_values(by=['validate_accuracy', 'difference'], ascending=False)

Unnamed: 0,model,attributes,train_accuracy,validate_accuracy,better_than_baseline,difference
18,Decision Tree Classifier,max_depth=11,0.777167,0.750079,True,0.027088
16,Decision Tree Classifier,max_depth=10,0.768412,0.749803,True,0.018609
14,Decision Tree Classifier,max_depth=9,0.759607,0.748069,True,0.011537
20,Decision Tree Classifier,max_depth=12,0.786951,0.747124,True,0.039827
22,Decision Tree Classifier,max_depth=13,0.797392,0.746454,True,0.050938
...,...,...,...,...,...,...
197,Extra Trees Model,leafs = 4 : depth = 2 : trees = 250,0.468658,0.468085,True,0.000573
55,Extra Trees Model,leafs = 1 : depth = 2 : trees = 300,0.467343,0.466312,True,0.001030
103,Extra Trees Model,leafs = 2 : depth = 2 : trees = 300,0.467343,0.466312,True,0.001030
151,Extra Trees Model,leafs = 3 : depth = 2 : trees = 300,0.467343,0.466312,True,0.001030


In [16]:
# Scratch check: walk a dict and print each key with its value, then the key alone.
test_dict = dict(key1='value1', key2='value2', key3='value3')
for item, value in test_dict.items():
    print(f"key {item}, value {value}")
    print(item)

key key1, value value1
key1
key key2, value value2
key2
key key3, value value3
key3
