In [29]:
# Solving Classification problems using Logistic Regression

# getting the Titanic dataset
import pandas as pd
from sklearn import linear_model
from sklearn import preprocessing

# Load the Titanic training set into a DataFrame.
# NOTE: the location below is an absolute, machine-specific Windows path;
# the CSV must exist at exactly this place for the cell to run.
df = pd.read_csv(
    r'C:\Users\maria\Downloads\Project Data\titanic\train.csv'
)
print(df.head())  # preview the first rows

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [31]:
# Remove the columns that are not useful to us in a single call
# (`columns=` is the keyword equivalent of `axis=1`).
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

In [None]:
# Discard every row that contains a NaN, then renumber the index from 0
# (drop=True throws the old index away instead of keeping it as a column).
df = df.dropna().reset_index(drop=True)
print(df.head(10))

In [None]:
# Encoding the Non-Numeric Fields

# A single encoder instance is reused for both columns; every
# fit_transform call refits it on the column it is given.
label_encoder = preprocessing.LabelEncoder()

# --- Sex -> integer codes -------------------------------------------
#   0 = female
#   1 = male
sex_encoded = label_encoder.fit_transform(df["Sex"])
df['Sex'] = sex_encoded
print(sex_encoded)

# --- Embarked -> integer codes --------------------------------------
#   0 = C
#   1 = Q
#   2 = S
embarked_encoded = label_encoder.fit_transform(df["Embarked"])
df['Embarked'] = embarked_encoded
print(embarked_encoded)

# show the frame with both columns now numeric
print(df.head())

In [None]:
# Making Fields Categorical
# Re-wrap each discrete-valued column as a pandas Categorical.
for column in ("Pclass", "Sex", "Embarked", "Survived"):
    df[column] = pd.Categorical(df[column])

# examine the datatypes for each feature
print(df.dtypes)

In [None]:
# Splitting the Dataset into Train and Test Sets
from sklearn.model_selection import train_test_split

# Survived is the label; every remaining column is a training feature
label = df['Survived']
features = df.drop(columns='Survived')

# Hold out 25% of the rows for testing. The split is seeded so it is
# repeatable, and stratified on the label (the same Survived column).
train_features, test_features, train_label, test_label = train_test_split(
    features,
    label,
    test_size=0.25,
    random_state=1,
    stratify=label,
)

# training set
print(train_features.head())
print(train_label)

In [None]:
# Test set for validation: feature preview first, then the labels
print(test_features.head(), test_label, sep="\n")

In [None]:
# Training the Model

# Create a logistic regression classifier with the library defaults
log_regress = linear_model.LogisticRegression()

# Fit it on the training split (features first, labels second)
log_regress.fit(train_features, train_label)

# Inspect the fitted parameters: the intercept, then the coefficients
print(log_regress.intercept_, log_regress.coef_, sep="\n")

In [None]:
# Making predictions
# Predicted class label for every row of the test split
preds = log_regress.predict(test_features)
print(preds)

# Predict the probabilities for the same rows
pred_probs = log_regress.predict_proba(test_features)
print(pred_probs)

In [None]:
# Displaying the Metrics

# Table of predictions (rows) vs actual labels (columns)
print(pd.crosstab(index=preds, columns=test_label))

# Accuracy of the predictions on the test split; kept as the final
# expression of the cell so the notebook displays the value.
log_regress.score(test_features, test_label)

In [None]:
from sklearn import metrics

# View the confusion matrix (rows = true labels, columns = predicted labels).
# It is printed explicitly: a notebook cell only auto-displays its final
# expression, so a bare call in the middle of the cell would compute the
# matrix and then silently discard it.
print(metrics.confusion_matrix(
    y_true = test_label,    # True labels
    y_pred = preds))        # Predicted labels

# View summary of common classification metrics
print(metrics.classification_report(
      y_true = test_label,
      y_pred = preds))