In [2]:
# importing libraries
import pandas            as pd                       # data science essentials
import matplotlib.pyplot as plt                      # data visualization
import seaborn           as sns                      # enhanced data viz
from sklearn.model_selection import train_test_split # train-test split
from sklearn.linear_model import LogisticRegression  # logistic regression
import statsmodels.formula.api as smf                # logistic regression
from sklearn.metrics import confusion_matrix         # confusion matrix
from sklearn.metrics import roc_auc_score            # auc score
from sklearn.neighbors import KNeighborsClassifier   # KNN for classification
from sklearn.neighbors import KNeighborsRegressor    # KNN for regression
from sklearn.preprocessing import StandardScaler     # standard scaler
from sklearn.tree import DecisionTreeClassifier      # classification trees
from sklearn.tree import plot_tree                   # tree plots
import random as rand
import gender_guesser.detector as gender


# loading data
file = "./GOT_character_predictions.xlsx"
got = pd.read_excel(io = file)

# setting pandas print options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)


# displaying the head of the dataset
got.head(n = 5)

Unnamed: 0,S.No,name,title,culture,dateOfBirth,mother,father,heir,house,spouse,book1_A_Game_Of_Thrones,book2_A_Clash_Of_Kings,book3_A_Storm_Of_Swords,book4_A_Feast_For_Crows,book5_A_Dance_with_Dragons,isAliveMother,isAliveFather,isAliveHeir,isAliveSpouse,isMarried,isNoble,age,numDeadRelations,popularity,isAlive
0,1,Viserys II Targaryen,,,,Rhaenyra Targaryen,Daemon Targaryen,Aegon IV Targaryen,,,0,0,0,0,0,1.0,0.0,0.0,,0,0,,11,0.605351,0
1,2,Walder Frey,Lord of the Crossing,Rivermen,208.0,,,,House Frey,Perra Royce,1,1,1,1,1,,,,1.0,1,1,97.0,1,0.896321,1
2,3,Addison Hill,Ser,,,,,,House Swyft,,0,0,0,1,0,,,,,0,1,,0,0.267559,1
3,4,Aemma Arryn,Queen,,82.0,,,,House Arryn,Viserys I Targaryen,0,0,0,0,0,,,,0.0,1,1,23.0,0,0.183946,0
4,5,Sylva Santagar,Greenstone,Dornish,276.0,,,,House Santagar,Eldon Estermont,0,0,0,1,0,,,,1.0,1,1,29.0,0,0.043478,1


In [2]:
#getting the shape
got.shape

(1946, 25)

In [3]:
#data info and type
got.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1946 entries, 0 to 1945
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   S.No                        1946 non-null   int64  
 1   name                        1946 non-null   object 
 2   title                       938 non-null    object 
 3   culture                     677 non-null    object 
 4   dateOfBirth                 433 non-null    float64
 5   mother                      21 non-null     object 
 6   father                      26 non-null     object 
 7   heir                        23 non-null     object 
 8   house                       1519 non-null   object 
 9   spouse                      276 non-null    object 
 10  book1_A_Game_Of_Thrones     1946 non-null   int64  
 11  book2_A_Clash_Of_Kings      1946 non-null   int64  
 12  book3_A_Storm_Of_Swords     1946 non-null   int64  
 13  book4_A_Feast_For_Crows     1946 

In [4]:
#missing values
got.isnull().any()

S.No                          False
name                          False
title                          True
culture                        True
dateOfBirth                    True
mother                         True
father                         True
heir                           True
house                          True
spouse                         True
book1_A_Game_Of_Thrones       False
book2_A_Clash_Of_Kings        False
book3_A_Storm_Of_Swords       False
book4_A_Feast_For_Crows       False
book5_A_Dance_with_Dragons    False
isAliveMother                  True
isAliveFather                  True
isAliveHeir                    True
isAliveSpouse                  True
isMarried                     False
isNoble                       False
age                            True
numDeadRelations              False
popularity                    False
isAlive                       False
dtype: bool

In [5]:
#sum of missing values
got.isnull().sum(axis = 0)

S.No                             0
name                             0
title                         1008
culture                       1269
dateOfBirth                   1513
mother                        1925
father                        1920
heir                          1923
house                          427
spouse                        1670
book1_A_Game_Of_Thrones          0
book2_A_Clash_Of_Kings           0
book3_A_Storm_Of_Swords          0
book4_A_Feast_For_Crows          0
book5_A_Dance_with_Dragons       0
isAliveMother                 1925
isAliveFather                 1920
isAliveHeir                   1923
isAliveSpouse                 1670
isMarried                        0
isNoble                          0
age                           1513
numDeadRelations                 0
popularity                       0
isAlive                          0
dtype: int64

In [3]:
#imputing median
age_median = got['age'].median()
dateOfBirth_median = got['dateOfBirth'].median()

#imputing unknowns
got['title'] = got['title'].fillna('unknown')
got['culture'] = got['culture'].fillna('unknown')
got['mother'] = got['mother'].fillna('unknown')
got['father'] = got['father'].fillna('unknown')
got['heir'] = got['heir'].fillna('unknown')
got['house'] = got['house'].fillna('unknown')
got['isAliveMother'] = got['title'].fillna(0)
got['isAliveFather'] = got['title'].fillna(0)
got['isAliveHeir'] = got['isAliveHeir'].fillna(0)
got['isAliveSpouse'] = got['isAliveSpouse'].fillna(0)
got['spouse'] = got['spouse'].fillna('unknown')

In [20]:
#check if there is any missing info
got.isnull().any()

S.No                          False
name                          False
title                         False
culture                       False
dateOfBirth                   False
mother                        False
father                        False
heir                          False
house                         False
spouse                        False
book1_A_Game_Of_Thrones       False
book2_A_Clash_Of_Kings        False
book3_A_Storm_Of_Swords       False
book4_A_Feast_For_Crows       False
book5_A_Dance_with_Dragons    False
isAliveMother                 False
isAliveFather                 False
isAliveHeir                   False
isAliveSpouse                 False
isMarried                     False
isNoble                       False
age                           False
numDeadRelations              False
popularity                    False
isAlive                       False
dtype: bool

In [25]:
# setting random seed
rand.seed(a = 327)
# STEP 1: splitting names

# placeholder list
placeholder_lst = []

# looping over each name
for index, col in got.iterrows():
   
    # splitting email domain at '@'
    split_name = got.loc[index, 'name'].split(sep = ' ')
   
    # appending placeholder_lst with the results
    placeholder_lst.append(split_name)

names = pd.DataFrame(placeholder_lst)


# displaying the results
names

Unnamed: 0,0,1,2,3,4,5
0,Viserys,II,Targaryen,,,
1,Walder,Frey,,,,
2,Addison,Hill,,,,
3,Aemma,Arryn,,,,
4,Sylva,Santagar,,,,
...,...,...,...,...,...,...
1941,Luwin,,,,,
1942,Reek,,,,,
1943,Symeon,Star-Eyes,,,,
1944,Coldhands,,,,,


In [28]:

#concatenating with original DataFrame

first_name = names[0]
print(first_name)
# guessing gender based on (given) name

# placeholder list
placeholder_lst = []


# looping to guess gender
for name in got['First_Name']:
    guess = gender.Detector().get_gender(name)
    print(guess)
    placeholder_lst.append(guess)


# converting list into a series
got['gender_guess'] = pd.Series(placeholder_lst)


# checking results
got.head(n = 5)

0         Viserys
1          Walder
2         Addison
3           Aemma
4           Sylva
          ...    
1941        Luwin
1942         Reek
1943       Symeon
1944    Coldhands
1945        Tytos
Name: 0, Length: 1946, dtype: object
unknown
unknown
andy
unknown
female
unknown
unknown
unknown
male
male
mostly_male
mostly_male
mostly_male
mostly_male
mostly_male
mostly_male
unknown
male
unknown
unknown
male
male
female
unknown
unknown
female
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
mostly_female
unknown
unknown
unknown
unknown
unknown
female
unknown
unknown
male
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
male
unknown
unknown
unknown
unknown
unknown
unknown
female
unknown
unknown
unknown
male
male
andy
andy
unknown
andy
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
female
male
male
unknown
male
male
male
male
male
m

unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
male
male
male
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
male
unknown
male
unknown
unknown
unknown
unknown
unknown
male
unknown
unknown
unknown
female
unknown
male
male
male
male
unknown
unknown
unknown
unknown
unknown
unknown
unknown
male
male
male
male
male
male
male
male
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
female
unknown
unknown
unknown
unknown
unknown
unknown
unknown
female
female
male
unknown
unknown
unknown
unknown
unknown
unknown
male
male
female
unknown
female
male
unknown
unknown
unknown
unknown
unknown
unknown
male
male
male
unknown
unknown
unknown
unknown
unknown
unknown
unknown
male
unknown
male
unknown
unknown
male
unknown
female
unknown
unknown
unknown
mostly_male
male
unknown
unknown
male
male
male
unknown
female
unkn

Unnamed: 0,S.No,name,First_Name,title,culture,dateOfBirth,mother,father,heir,house,spouse,book1_A_Game_Of_Thrones,book2_A_Clash_Of_Kings,book3_A_Storm_Of_Swords,book4_A_Feast_For_Crows,book5_A_Dance_with_Dragons,isAliveMother,isAliveFather,isAliveHeir,isAliveSpouse,isMarried,isNoble,age,numDeadRelations,popularity,isAlive,gender_guess
0,1,Viserys II Targaryen,Viserys,,,,Rhaenyra Targaryen,Daemon Targaryen,Aegon IV Targaryen,,,0,0,0,0,0,1.0,0.0,0.0,,0,0,,11,0.605351,0,unknown
1,2,Walder Frey,Walder,Lord of the Crossing,Rivermen,208.0,,,,House Frey,Perra Royce,1,1,1,1,1,,,,1.0,1,1,97.0,1,0.896321,1,unknown
2,3,Addison Hill,Addison,Ser,,,,,,House Swyft,,0,0,0,1,0,,,,,0,1,,0,0.267559,1,andy
3,4,Aemma Arryn,Aemma,Queen,,82.0,,,,House Arryn,Viserys I Targaryen,0,0,0,0,0,,,,0.0,1,1,23.0,0,0.183946,0,unknown
4,5,Sylva Santagar,Sylva,Greenstone,Dornish,276.0,,,,House Santagar,Eldon Estermont,0,0,0,1,0,,,,1.0,1,1,29.0,0,0.043478,1,female


In [4]:
#gender guesser
one_hot_gender_guess = pd.get_dummies(got['gender_guess'])
got = got.join([one_hot_gender_guess])

KeyError: 'gender_guess'

In [5]:
#dropping missing values
got = got.drop(['female'], axis = 1)
got = got.drop(['mostly_male','andy','unknown'], axis = 1)
got = got.drop('female', axis = 1)
got = got.drop('mostly_female', axis = 1)
got = got.drop('gender_guess' , axis = 1)
got.head()


KeyError: "['female'] not found in axis"

In [32]:
#visualize the new columns
got.columns

Index(['S.No', 'name', 'First_Name', 'title', 'culture', 'dateOfBirth', 'mother', 'father', 'heir', 'house', 'spouse', 'book1_A_Game_Of_Thrones', 'book2_A_Clash_Of_Kings', 'book3_A_Storm_Of_Swords', 'book4_A_Feast_For_Crows', 'book5_A_Dance_with_Dragons', 'isAliveMother', 'isAliveFather', 'isAliveHeir', 'isAliveSpouse', 'isMarried', 'isNoble', 'age', 'numDeadRelations', 'popularity', 'isAlive', 'gender_guess', 'male', 'mostly_female'], dtype='object')

In [7]:
# optimal_neighbors
########################################
def optimal_neighbors(x_data,
                      y_data,
                      standardize = True,
                      pct_test=0.1,
                      seed=219,
                      response_type='reg',
                      max_neighbors=20,
                      show_viz=True):
    """
Exhaustively compute training and testing results for KNN across
[1, max_neighbors]. Outputs the maximum test score and (by default) a
visualization of the results.
PARAMETERS
----------
x_data        : explanatory variable data
y_data        : response variable
standardize   : whether or not to standardize the x data, default True
pct_test      : test size for training and validation from (0,1), default 0.25
seed          : random seed to be used in algorithm, default 219
response_type : type of neighbors algorithm to use, default 'reg'
    Use 'reg' for regression (KNeighborsRegressor)
    Use 'class' for classification (KNeighborsClassifier)
max_neighbors : maximum number of neighbors in exhaustive search, default 20
show_viz      : display or surpress k-neigbors visualization, default True
"""    
    
    
    if standardize == True:
        # optionally standardizing x_data
        scaler             = StandardScaler()
        scaler.fit(x_data)
        x_scaled           = scaler.transform(x_data)
        x_scaled_df        = pd.DataFrame(x_scaled)
        x_data             = x_scaled_df



    # train-test split
    x_train, x_test, y_train, y_test = train_test_split(x_data,
                                                        y_data,
                                                        test_size = pct_test,
                                                        random_state = seed)


    # creating lists for training set accuracy and test set accuracy
    training_accuracy = []
    test_accuracy = []
    
    
    # setting neighbor range
    neighbors_settings = range(1, max_neighbors + 1)


    for n_neighbors in neighbors_settings:
        # building the model based on response variable type
        if response_type == 'reg':
            clf = KNeighborsRegressor(n_neighbors = n_neighbors)
            clf.fit(x_train, y_train)
            
        elif response_type == 'class':
            clf = KNeighborsClassifier(n_neighbors = n_neighbors)
            clf.fit(x_train, y_train)            
            
        else:
            print("Error: response_type must be 'reg' or 'class'")
        
        
        # recording the training set accuracy
        training_accuracy.append(clf.score(x_train, y_train))
    
        # recording the generalization accuracy
        test_accuracy.append(clf.score(x_test, y_test))


    # optionally displaying visualization
    if show_viz == True:
        # plotting the visualization
        fig, ax = plt.subplots(figsize=(12,8))
        plt.plot(neighbors_settings, training_accuracy, label = "training accuracy")
        plt.plot(neighbors_settings, test_accuracy, label = "test accuracy")
        plt.ylabel("Accuracy")
        plt.xlabel("n_neighbors")
        plt.legend()
        plt.show()
    
    
    # returning optimal number of neighbors
    print(f"The optimal number of neighbors is: {test_accuracy.index(max(test_accuracy))+1}")
    return test_accuracy.index(max(test_accuracy))+1


########################################
# visual_cm
########################################
def visual_cm(true_y, pred_y, labels = None):
    """
Creates a visualization of a confusion matrix.

PARAMETERS
----------
true_y : true values for the response variable
pred_y : predicted values for the response variable
labels : , default None
    """
    # visualizing the confusion matrix

    # setting labels
    lbls = labels
    

    # declaring a confusion matrix object
    cm = confusion_matrix(y_true = true_y,
                          y_pred = pred_y)


    # heatmap
    sns.heatmap(cm,
                annot       = True,
                xticklabels = lbls,
                yticklabels = lbls,
                cmap        = 'Blues',
                fmt         = 'g')


    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix of the Classifier')
    plt.show()

In [8]:
#correlation analysis
df_corr = got.corr(method = 'pearson').round(decimals = 2)

df_corr['isAlive'].sort_values(ascending = False)

isAlive                       1.00
book4_A_Feast_For_Crows       0.27
age                           0.09
book5_A_Dance_with_Dragons    0.03
book3_A_Storm_Of_Swords       0.01
isAliveSpouse                -0.01
isNoble                      -0.04
isMarried                    -0.05
book2_A_Clash_Of_Kings       -0.07
isAliveHeir                  -0.08
dateOfBirth                  -0.09
S.No                         -0.13
book1_A_Game_Of_Thrones      -0.15
popularity                   -0.18
numDeadRelations             -0.19
Name: isAlive, dtype: float64

In [9]:
#stratifying the response variable
got.loc[ : ,'isAlive'].value_counts(normalize = True).round(decimals = 2)

1    0.75
0    0.25
Name: isAlive, dtype: float64

In [10]:
# declaring explanatory variables
got_data = got.drop('isAlive', axis = 1)


# declaring response variable
got_target = got.loc[ : , 'isAlive']  

In [12]:
# train-test split with stratification
x_train, x_test, y_train, y_test = train_test_split(
            got_data,
            got_target,
            test_size    = 0.1,
            random_state = 219,
            stratify     = got_target) # preserving balance


# merging training data for statsmodels
got_train = pd.concat([x_train, y_train], axis = 1)

In [40]:
print(f"""

Response Variable Proportions (Training Set)
--------------------------------------------
{y_train.value_counts(normalize = True).round(decimals = 2)}



Response Variable Proportions (Testing Set)
--------------------------------------------
{y_test.value_counts(normalize = True).round(decimals = 2)}
""")



Response Variable Proportions (Training Set)
--------------------------------------------
1    0.75
0    0.25
Name: isAlive, dtype: float64



Response Variable Proportions (Testing Set)
--------------------------------------------
1    0.75
0    0.25
Name: isAlive, dtype: float64



In [13]:
# instantiating a logistic regression model object
logistic_small = smf.logit(formula   = """isAlive ~ book4_A_Feast_For_Crows""",
                           data = got_train)


# FITTING the model object
results_logistic = logistic_small.fit()


# checking the results SUMMARY
results_logistic.summary2()

Optimization terminated successfully.
         Current function value: 0.534562
         Iterations 6


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.057
Dependent Variable:,isAlive,AIC:,1876.0368
Date:,2021-12-04 14:48,BIC:,1886.9727
No. Observations:,1751,Log-Likelihood:,-936.02
Df Model:,1,LL-Null:,-992.53
Df Residuals:,1749,LLR p-value:,2.1240999999999998e-26
Converged:,1.0000,Scale:,1.0
No. Iterations:,6.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.4601,0.0765,6.0172,0.0000,0.3102,0.6100
book4_A_Feast_For_Crows,1.1895,0.1141,10.4212,0.0000,0.9658,1.4132


In [14]:
for val in got_data:
    print(f" {val} + ")

 S.No + 
 name + 
 title + 
 culture + 
 dateOfBirth + 
 mother + 
 father + 
 heir + 
 house + 
 spouse + 
 book1_A_Game_Of_Thrones + 
 book2_A_Clash_Of_Kings + 
 book3_A_Storm_Of_Swords + 
 book4_A_Feast_For_Crows + 
 book5_A_Dance_with_Dragons + 
 isAliveMother + 
 isAliveFather + 
 isAliveHeir + 
 isAliveSpouse + 
 isMarried + 
 isNoble + 
 age + 
 numDeadRelations + 
 popularity + 


In [46]:
# instantiating a logistic regression model object
logistic_full = smf.logit(formula = """ isAlive ~                                                                                                                             
                                        popularity +
                                        numDeadRelations +
                                        book3_A_Storm_Of_Swords +
                                        numDeadRelations +
                                        book2_A_Clash_Of_Kings +
                                        book1_A_Game_Of_Thrones +
                                        dateOfBirth + 
                                        book4_A_Feast_For_Crows """,
                                        data    = got_train)


# fitting the model object
results_full = logistic_full.fit()


# checking the results SUMMARY
results_full.summary2()

Optimization terminated successfully.
         Current function value: 0.505146
         Iterations 9


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.235
Dependent Variable:,isAlive,AIC:,394.8592
Date:,2021-12-04 17:13,BIC:,426.2746
No. Observations:,375,Log-Likelihood:,-189.43
Df Model:,7,LL-Null:,-247.76
Df Residuals:,367,LLR p-value:,3.7785e-22
Converged:,1.0000,Scale:,1.0
No. Iterations:,9.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-1.1557,0.2569,-4.4985,0.0000,-1.6592,-0.6522
popularity,-0.2612,0.6615,-0.3949,0.6929,-1.5577,1.0353
numDeadRelations,-0.1135,0.0625,-1.8169,0.0692,-0.2359,0.0089
book3_A_Storm_Of_Swords,0.7225,0.4105,1.7601,0.0784,-0.0820,1.5271
book2_A_Clash_Of_Kings,-0.1234,0.4334,-0.2847,0.7759,-0.9728,0.7260
book1_A_Game_Of_Thrones,-0.8076,0.3711,-2.1764,0.0295,-1.5349,-0.0803
dateOfBirth,-0.0000,0.0000,-0.5063,0.6127,-0.0001,0.0000
book4_A_Feast_For_Crows,2.4379,0.3559,6.8493,0.0000,1.7403,3.1355


In [41]:
# instantiating a logistic regression model object
logistic_full = smf.logit(formula = """ isAlive ~                                                                                                                             
                                        popularity +
                                        book3_A_Storm_Of_Swords +
                                        numDeadRelations +
                                        book2_A_Clash_Of_Kings +
                                        book1_A_Game_Of_Thrones +
                                        book4_A_Feast_For_Crows """,
                                        data    = got_train)


# fitting the model object
results_full = logistic_full.fit()


# checking the results SUMMARY
results_full.summary2()

Optimization terminated successfully.
         Current function value: 0.503360
         Iterations 6


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.112
Dependent Variable:,isAlive,AIC:,1776.7662
Date:,2021-12-04 17:05,BIC:,1815.0418
No. Observations:,1751,Log-Likelihood:,-881.38
Df Model:,6,LL-Null:,-992.53
Df Residuals:,1744,LLR p-value:,3.3603e-45
Converged:,1.0000,Scale:,1.0
No. Iterations:,6.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.8271,0.0907,9.1163,0.0000,0.6493,1.0050
popularity,-1.5107,0.4374,-3.4538,0.0006,-2.3681,-0.6534
book3_A_Storm_Of_Swords,-0.2603,0.1412,-1.8430,0.0653,-0.5371,0.0165
numDeadRelations,-0.1401,0.0500,-2.8044,0.0050,-0.2381,-0.0422
book2_A_Clash_Of_Kings,-0.3228,0.1382,-2.3367,0.0195,-0.5936,-0.0520
book1_A_Game_Of_Thrones,-0.4611,0.1539,-2.9969,0.0027,-0.7627,-0.1595
book4_A_Feast_For_Crows,1.5607,0.1382,11.2890,0.0000,1.2897,1.8316


In [47]:
# instantiating a logistic regression model object
logistic_full = smf.logit(formula = """ isAlive ~                                                                                                                             
                                        popularity +
                                        numDeadRelations +
                                        book2_A_Clash_Of_Kings +
                                        book1_A_Game_Of_Thrones +
                                        book4_A_Feast_For_Crows """,
                                        data    = got_train)


# fitting the model object
results_full = logistic_full.fit()


# checking the results SUMMARY
results_full.summary2()

Optimization terminated successfully.
         Current function value: 0.504329
         Iterations 6


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.11
Dependent Variable:,isAlive,AIC:,1778.1616
Date:,2021-12-04 17:15,BIC:,1810.9692
No. Observations:,1751,Log-Likelihood:,-883.08
Df Model:,5,LL-Null:,-992.53
Df Residuals:,1745,LLR p-value:,2.5481000000000002e-45
Converged:,1.0000,Scale:,1.0
No. Iterations:,6.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.7849,0.0875,8.9714,0.0000,0.6134,0.9564
popularity,-1.5634,0.4377,-3.5716,0.0004,-2.4214,-0.7055
numDeadRelations,-0.1378,0.0501,-2.7510,0.0059,-0.2360,-0.0396
book2_A_Clash_Of_Kings,-0.3911,0.1331,-2.9383,0.0033,-0.6519,-0.1302
book1_A_Game_Of_Thrones,-0.4772,0.1538,-3.1025,0.0019,-0.7786,-0.1757
book4_A_Feast_For_Crows,1.4641,0.1264,11.5812,0.0000,1.2163,1.7119


In [55]:
# creating a dictionary to store candidate models

candidate_dict = {

 # full model
 'logit_full'   : ['popularity' ,'numDeadRelations' , 'book3_A_Storm_Of_Swords'
                    'numDeadRelations' , 'book2_A_Clash_Of_Kings' , 'book1_A_Game_Of_Thrones'
                   'dateOfBirth' , 'book4_A_Feast_For_Crows']
 
# significant variables only (set 1)
'logit_sig' : ['popularity' , 'book3_A_Storm_Of_Swords' , 'numDeadRelations' ,
              'book2_A_Clash_Of_Kings', 'book1_A_Game_Of_Thrones' , 
              'book4_A_Feast_For_Crows']    
    
    

# significant variables only (set 2)
 'logit_sig_2'  : ['popularity' , 'numDeadRelations' , 'book2_A_Clash_Of_Kings', 'book1_A_Game_Of_Thrones'
                    'book4_A_Feast_For_Crows']
}





SyntaxError: invalid syntax (Temp/ipykernel_6956/3407628966.py, line 11)

In [54]:
# train/test split with the full model
got_data   =  got.loc[ : , candidate_dict['logit_sig']]
got_target =  got.loc[ : , 'isAlive']


# this is the exact code we were using before
x_train, x_test, y_train, y_test = train_test_split(
            got_data,
            got_target,
            random_state = 219,
            test_size    = 0.1,
            stratify     = got_target)


# INSTANTIATING a logistic regression model
logreg = LogisticRegression(solver = 'lbfgs',
                            C = 1,
                            random_state = 219)


# FITTING the training data
logreg_fit = logreg.fit(x_train, y_train)


# PREDICTING based on the testing set
logreg_pred = logreg_fit.predict(x_test)


# SCORING the results
print('LogReg Training ACCURACY:', logreg_fit.score(x_train, y_train).round(4))
print('LogReg Testing  ACCURACY:', logreg_fit.score(x_test, y_test).round(4))

# saving scoring data for future use
logreg_train_score = logreg_fit.score(x_train, y_train).round(4) # accuracy
logreg_test_score  = logreg_fit.score(x_test, y_test).round(4)   # accuracy


# displaying and saving the gap between training and testing
print('LogReg Train-Test Gap   :', abs(logreg_train_score - logreg_test_score).round(4))
logreg_test_gap = abs(logreg_train_score - logreg_test_score).round(4)

NameError: name 'candidate_dict' is not defined