In [1]:
import warnings
warnings.filterwarnings("ignore")
import wrangle as w
import functions as f

import pandas as pd
import numpy as np

#splits
from sklearn.model_selection import train_test_split

#visualization
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns

#scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import precision_score, accuracy_score, recall_score, classification_report

#model
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression


In [2]:
df = pd.read_csv("dallas.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27571 entries, 0 to 27570
Data columns (total 33 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Animal Id               27571 non-null  object
 1   Animal Type             27571 non-null  object
 2   Animal Breed            27536 non-null  object
 3   Kennel Number           27571 non-null  object
 4   Kennel Status           27571 non-null  object
 5   Activity Number         10644 non-null  object
 6   Activity Sequence       27571 non-null  int64 
 7   Source Id               27571 non-null  object
 8   Census Tract            17461 non-null  object
 9   Council District        17461 non-null  object
 10  Intake Type             27571 non-null  object
 11  Intake Subtype          27571 non-null  object
 12  Intake Total            27571 non-null  int64 
 13  Reason                  23469 non-null  object
 14  Staff Id                27571 non-null  object
 15  In

In [4]:
df.describe()

Unnamed: 0,Activity Sequence,Intake Total
count,27571.0,27571.0
mean,0.926626,1.0
std,0.855598,0.0
min,0.0,1.0
25%,1.0,1.0
50%,1.0,1.0
75%,1.0,1.0
max,27.0,1.0


In [5]:
# Lowercase every cell and every column label.
# Gotcha: the string cast turns missing values into the literal text "nan"
# and turns numeric columns into text as well.
df = df.astype(str).apply(lambda column: column.str.lower())
df = df.rename(columns=str.lower)

In [6]:
# Drop every animal id that occurs more than once.
# keep=False marks ALL occurrences of a repeated id (not only the second and
# later ones), so no row of a repeated animal survives the filter.
df = df[~df['animal id'].duplicated(keep=False)]

In [7]:
# Keep only the rows whose 'animal type' is dog or cat.
is_dog_or_cat = df['animal type'].isin(['dog', 'cat'])
df = df.loc[is_dog_or_cat]


In [8]:
df.columns

Index(['animal id', 'animal type', 'animal breed', 'kennel number',
       'kennel status', 'activity number', 'activity sequence', 'source id',
       'census tract', 'council district', 'intake type', 'intake subtype',
       'intake total', 'reason', 'staff id', 'intake date', 'intake time',
       'due out', 'intake condition', 'hold request', 'outcome type',
       'outcome subtype', 'outcome date', 'outcome time', 'receipt number',
       'impound number', 'service request number', 'outcome condition',
       'chip status', 'animal origin', 'additional information', 'month',
       'year'],
      dtype='object')

In [9]:
df["animal origin"].value_counts()

over the counter    7096
field               4827
nan                 4067
aggops               380
bite                 169
aggdd                  7
hart                   4
ops                    1
Name: animal origin, dtype: int64

In [10]:
# Narrow the frame to the eight columns used from here on.
df = df.loc[:, [
    "animal type",
    "animal breed",
    "council district",
    "intake type",
    "intake condition",
    "outcome type",
    "outcome condition",
    "chip status",
]]

In [11]:
# Replace spaces in column labels with underscores so columns can also be
# reached with attribute access (e.g. df.chip_status).
new_columns = {col: col.replace(' ', '_') for col in df.columns}
# Reassign instead of inplace=True: df is a selection of an earlier frame, and
# an in-place rename on such a selection triggers SettingWithCopyWarning.
df = df.rename(columns=new_columns)

In [12]:
# Drop every row in which any column holds the literal value "as".
no_as_anywhere = df.ne("as").all(axis=1)
df = df[no_as_anywhere]

In [13]:
# Drop every row in which any column holds the literal text "nan"
# (missing values were turned into that string by the earlier astype(str)).
no_nan_anywhere = df.ne("nan").all(axis=1)
df = df[no_nan_anywhere]

In [14]:
# council_district is text at this point; convert it to floating point.
df["council_district"] = df["council_district"].astype(float)

In [15]:
for col in df.columns:                                                                                                         
    print(df[col].value_counts())

dog    8021
cat    1891
Name: animal_type, dtype: int64
mixed breed      5041
domestic sh      1713
pit bull          887
germ shepherd     473
labrador retr     267
                 ... 
bernese hound       1
irish setter        1
dachshund lh        1
bull terr min       1
belg tervuren       1
Name: animal_breed, Length: 123, dtype: int64
4.0     1457
6.0     1439
8.0     1359
7.0      964
5.0      948
1.0      938
3.0      854
2.0      569
9.0      379
10.0     269
14.0     244
13.0     194
11.0     177
12.0     107
0.0       14
Name: council_district, dtype: int64
stray              8185
owner surrender     783
confiscated         534
keepsafe            310
dispos req           59
treatment            32
transfer              5
foster                4
Name: intake_type, dtype: int64
app wnl     7444
app inj      949
app sick     922
critical     281
underage     167
deceased     139
fatal         10
Name: intake_condition, dtype: int64
adoption             3048
euthanized        

In [16]:
df.head()

Unnamed: 0,animal_type,animal_breed,council_district,intake_type,intake_condition,outcome_type,outcome_condition,chip_status
0,cat,domestic sh,11.0,stray,app inj,euthanized,critical,scan no chip
1,dog,mixed breed,5.0,stray,app wnl,adoption,app wnl,scan no chip
3,dog,mixed breed,9.0,stray,app wnl,adoption,app wnl,scan no chip
4,dog,chihuahua sh,9.0,stray,app sick,transfer,app sick,scan no chip
5,dog,mixed breed,6.0,stray,app wnl,adoption,app wnl,scan no chip


In [17]:
df.chip_status.value_counts()

scan no chip      7900
scan chip         1819
unable to scan     193
Name: chip_status, dtype: int64

In [18]:
# Drop animals whose chip could not be scanned, leaving a two-valued column.
# .copy() detaches the result from the frame it was filtered from, so the
# column assignments that follow write to an independent frame rather than to
# a slice (which would raise SettingWithCopyWarning).
df = df[df.chip_status != "unable to scan"].copy()

In [19]:
# 1 when animal_type (ignoring surrounding whitespace) is "dog", else 0.
df['is_dog'] = df['animal_type'].str.strip().eq("dog").astype("int64")

In [20]:
# 1 when a chip was found on scan, 0 when no chip was found.
# map() builds the integer column directly; replace() on a text column leaves
# the dtype to pandas' object-downcasting, which newer pandas versions
# deprecate. Any value outside the mapping would become NaN instead of
# silently staying a string.
df["is_chip"] = df['chip_status'].map({"scan no chip": 0, "scan chip": 1})


In [21]:
# 1 when outcome_condition (ignoring surrounding whitespace) is "app wnl", else 0.
# NOTE(review): this flag is read from outcome_condition, not intake_condition —
# confirm that value is known before the outcome, since it is later used as a
# feature for predicting is_adopted.
df['is_normal'] = df['outcome_condition'].str.strip().eq("app wnl").astype("int64")

In [22]:
def map_outcome_condition(outcome):
    """Return 1 when ``outcome`` is exactly 'deceased' or 'fatal', otherwise 0.

    The comparison is exact: no stripping or case folding is applied.
    """
    if outcome in ('deceased', 'fatal'):
        return 1
    return 0

# Flag each row using map_outcome_condition applied to its outcome_condition.
df['is_deceased'] = df['outcome_condition'].apply(map_outcome_condition)


In [23]:
# 1 when outcome_condition (ignoring surrounding whitespace) is "underage", else 0.
df['is_underage'] = df['outcome_condition'].str.strip().eq("underage").astype("int64")

In [24]:
def map_outcome_sick(outcome):
    """Return 1 when ``outcome`` is 'app sick', 'critical' or 'app inj', otherwise 0.

    The comparison is exact: no stripping or case folding is applied.
    """
    sick_labels = ('app sick', 'critical', 'app inj')
    return int(outcome in sick_labels)

# Flag each row using map_outcome_sick applied to its outcome_condition.
df["is_sick"] = df['outcome_condition'].apply(map_outcome_sick)


In [25]:
df.animal_type.value_counts()

dog    7872
cat    1847
Name: animal_type, dtype: int64

In [26]:
# 1 when intake_type (ignoring surrounding whitespace) is "stray", else 0.
df['is_stray'] = df['intake_type'].str.strip().eq("stray").astype("int64")

In [27]:
# 1 when intake_type (ignoring surrounding whitespace) is "owner surrender", else 0.
df['is_owner_surrender'] = df['intake_type'].str.strip().eq("owner surrender").astype("int64")

In [28]:
# 1 when intake_type (ignoring surrounding whitespace) is "confiscated", else 0.
df['is_confiscated'] = df['intake_type'].str.strip().eq("confiscated").astype("int64")

In [29]:
# Target column: 1 when outcome_type (ignoring surrounding whitespace) is "adoption", else 0.
df['is_adopted'] = df['outcome_type'].str.strip().eq("adoption").astype("int64")

In [31]:
# Keep only the engineered binary columns; is_adopted is last.
df = df.loc[:, [
    'is_dog',
    'is_chip',
    'is_normal',
    'is_deceased',
    'is_underage',
    'is_sick',
    'is_stray',
    'is_owner_surrender',
    'is_confiscated',
    'is_adopted',
]]

In [32]:
f.get_models

<function functions.get_models(X_train, y_train, X_validate, y_validate)>

In [33]:
def get_xy(model_df, target='is_adopted'):
    '''
    Split ``model_df`` into train / validate / test and separate the features
    from the target column in each split.

    Usage:
        X_train, y_train, X_validate, y_validate, X_test, y_test = get_xy(df)

    Parameters:
        model_df: DataFrame containing the feature columns and ``target``.
        target: name of the label column. Defaults to 'is_adopted', which
            reproduces the original hard-coded behaviour.

    Returns:
        A 6-tuple (X_train, y_train, X_validate, y_validate, X_test, y_test),
        where each X is the split without ``target`` and each y is the
        ``target`` column of that split.

    The splitting itself is delegated to ``w.split_data(model_df, target)``.
    NOTE(review): that helper appears to print the split sizes as a side
    effect — confirm in wrangle.py.
    '''
    train, validate, test = w.split_data(model_df, target)

    def _features_and_label(split):
        # Separate one split into (features, label).
        return split.drop(columns=[target]), split[target]

    X_train, y_train = _features_and_label(train)
    X_validate, y_validate = _features_and_label(validate)
    X_test, y_test = _features_and_label(test)
    return X_train, y_train, X_validate, y_validate, X_test, y_test

In [34]:
train, validate, test = w.split_data(df, "is_adopted")

train -> (5831, 10), 60.0%
validate -> (1944, 10),20.0%
test -> (1944, 10), 20.0%


In [35]:
# get x and y variables
x_train,y_train,x_validate,y_validate,x_test,y_test = get_xy(df)

train -> (5831, 10), 60.0%
validate -> (1944, 10),20.0%
test -> (1944, 10), 20.0%


In [36]:
f.get_models(x_train,y_train,x_validate,y_validate)

Unnamed: 0,model,set,accuracy
0,k_nearest_neighbors,train,0.749614
1,k_nearest_neighbors,validate,0.748457
2,logistic_regression,train,0.749614
3,logistic_regression,validate,0.748457
4,DecisionTreeClassifier,train,0.750986
5,DecisionTreeClassifier,validate,0.750514
6,random_forest,train,0.753044
7,random_forest,validate,0.741255
8,support_vector_machine,train,0.756474
9,support_vector_machine,validate,0.742798


In [37]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sqlalchemy import text, create_engine
import scipy.stats as stats

import warnings
warnings.filterwarnings("ignore")

np.random.seed(42)

def chi_stats(train, feature, target):
    '''
    Run a chi-square test of independence between two columns of ``train``.

    Parameters:
        train: DataFrame containing both columns.
        feature: name of the column to test.
        target: name of the column to test it against.

    Returns:
        A one-row DataFrame with the columns 'Chi-square statistic',
        'p-value' and 'Degrees of freedom'. The contingency table itself is
        not returned.

    The test is run with scipy.stats.chi2_contingency at its default settings;
    with those defaults SciPy applies Yates' continuity correction when the
    table has one degree of freedom.
    '''
    # Cross-tabulate the observed counts of feature values against target values.
    contingency_table = pd.crosstab(train[feature], train[target])

    # The expected-frequency table is returned by SciPy but not needed here.
    chi2, p_value, dof, _expected = stats.chi2_contingency(contingency_table)

    return pd.DataFrame({
        'Chi-square statistic': [chi2],
        'p-value': [p_value],
        'Degrees of freedom': [dof]
    })

In [38]:
train

Unnamed: 0,is_dog,is_chip,is_normal,is_deceased,is_underage,is_sick,is_stray,is_owner_surrender,is_confiscated,is_adopted
1840,1,1,1,0,0,0,0,0,1,0
24387,1,0,1,0,0,0,1,0,0,1
9123,1,0,1,0,0,0,0,1,0,0
18034,1,0,1,0,0,0,0,1,0,0
1740,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
2511,1,0,1,0,0,0,1,0,0,1
25016,1,0,1,0,0,0,1,0,0,0
26192,1,0,0,0,0,1,1,0,0,0
12820,0,1,0,0,0,1,1,0,0,0


In [39]:
chi_stats(train, feature= "is_chip", target= "is_adopted")

Unnamed: 0,Chi-square statistic,p-value,Degrees of freedom
0,102.087105,5.313328e-24,1


In [40]:
# Chi-square test of every feature against the target.
for col in train.columns:
    # Skip the target itself: testing a column against itself says nothing
    # about the features.
    if col == "is_adopted":
        continue
    print(col)
    print(chi_stats(train, feature= col, target= "is_adopted"))

is_dog
   Chi-square statistic       p-value  Degrees of freedom
0             68.204982  1.473530e-16                   1
is_chip
   Chi-square statistic       p-value  Degrees of freedom
0            102.087105  5.313328e-24                   1
is_normal
   Chi-square statistic        p-value  Degrees of freedom
0           1176.232377  8.927083e-258                   1
is_deceased
   Chi-square statistic       p-value  Degrees of freedom
0             87.142713  1.009663e-20                   1
is_underage
   Chi-square statistic       p-value  Degrees of freedom
0            186.056674  2.307386e-42                   1
is_sick
   Chi-square statistic        p-value  Degrees of freedom
0            689.046655  7.204565e-152                   1
is_stray
   Chi-square statistic       p-value  Degrees of freedom
0            140.290385  2.299926e-32                   1
is_owner_surrender
   Chi-square statistic       p-value  Degrees of freedom
0             35.684954  2.319485e-09    

In [41]:
df.is_adopted.value_counts()

0    6674
1    3045
Name: is_adopted, dtype: int64

In [42]:
df.shape

(9719, 10)

In [None]:
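# Baseline accuracy ≈ 68.66% — presumably the share of the majority class (not adopted: 6674 of 9719 rows).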