In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from matplotlib import pyplot as plt 
import seaborn as sns
# Initialise seaborn's plotting defaults for every figure drawn below (color_codes is enabled).
sns.set(color_codes = True)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import PolynomialFeatures

from sklearn.pipeline import make_pipeline

from sklearn.metrics import accuracy_score

## Let's first check how our simple multivariate logistic regression performs in comparison to scikit-learn's logistic regression.

In [None]:
# Initialise a small synthetic data set: 15 samples, 5 features, binary labels.
# The global NumPy RNG is seeded first so that "Restart & Run All" reproduces
# exactly the same X and Y (and therefore the same accuracies further down).
np.random.seed(42)

X  = np.random.rand(15,5)
Y = np.random.randint(2, size=(15,1))

# No standardisation is needed: np.random.rand draws from a uniform
# distribution on [0, 1), so every feature is already on the same scale.

In [None]:
# Closed-form least-squares fit squashed through a sigmoid.
# NOTE: the normal equation yields the ordinary least-squares solution of a *linear*
# model; it is not the maximum-likelihood logistic fit (which has no closed form).
# It is used here only as a quick baseline to compare against scikit-learn.

# Prepend a column of ones so that theta[0] acts as the intercept.
X  = np.concatenate((np.ones((X.shape[0],1)),X), axis=1)
# Solve min ||X @ theta - Y||^2. lstsq gives the same theta as inv(X.T X) X.T Y when
# X.T X is invertible, but avoids forming the explicit inverse and still returns a
# (minimum-norm) solution when X.T X is singular, e.g. if this cell is run twice and
# a second constant column is added.
theta = np.linalg.lstsq(X, Y, rcond=None)[0]
z = np.dot(X, theta)      # linear scores: (n, p+1) @ (p+1, 1) -> (n, 1)
hx = 1/(1+ np.exp(-z))    # sigmoid maps the scores into (0, 1)


In [None]:
# Sweep nine evenly spaced decision thresholds over [0, 1] and report the
# accuracy obtained when hx is binarised at each of them.
cut_of = np.linspace(0, 1, 9)

for i in cut_of:
    predicted_value = np.where(hx > i, 1, 0)   # 1 where the score exceeds the cutoff
    accuracy = accuracy_score(Y, predicted_value)
    print(f"accuracy for cutoff value: {i} is {accuracy}")

In [None]:
# Reference model: scikit-learn's LogisticRegression fitted on the same matrix X.
# The trailing expression displays its accuracy on that same data.
logr = LogisticRegression().fit(X, np.ravel(Y))
pp = logr.predict(X)
accuracy_score(Y, pp)

# Conclusion

## Our implementation of LogR is as good as scikit-learn's. So, depending on your preference, you can opt for whichever is more suitable.

# Let's implement LogR on the Titanic dataset.

In [None]:
# Load the Titanic training CSV into a DataFrame.
X = pd.read_csv(filepath_or_buffer="/kaggle/input/titanicdataset-traincsv/train.csv")

In [None]:
# Summary statistics of the loaded frame; used here to spot columns with missing values.
X.describe()

### There are missing values in Age.

In [None]:
# Mean passenger age, rounded to the nearest whole year.
round(X["Age"].mean())

In [None]:
# Preview of the imputation values: one random integer in [26, 34), i.e. around
# the mean age, for every row whose Age is missing.
np.random.randint(26, 34, size=X["Age"].isna().sum())

In [None]:
# Fill the missing ages with random integers in [26, 34), i.e. around the mean age.
# The mask is computed once and reused for both the count and the assignment.
m = X['Age'].isna()

# A dedicated, seeded generator makes the imputed values identical on every run,
# independent of how much of the global RNG earlier cells have consumed.
age_rng = np.random.RandomState(42)
X.loc[m, 'Age'] = age_rng.randint(26, 34, size=int(m.sum()))

In [None]:
# Show only the first rows after the Age imputation instead of dumping the whole frame.
X.head()

In [None]:
## Cabin has a lot of NaN values. Count them here; the feature-importance
## checks further down decide whether the column is still worth keeping.

X["Cabin"].isna().sum()

In [None]:
# Reduce Cabin to a single leading character (presumably the deck letter).
# Missing cabins become the placeholder string "nan" first, so they end up as "n".
X.Cabin = X.Cabin.fillna("nan")
# Strip the digits. regex=True must be explicit: pandas >= 2.0 treats the pattern
# as a literal string by default, which would silently leave every digit in place.
# The raw string avoids Python's invalid-escape warning for "\d".
X.Cabin = X.Cabin.str.replace(r'\d+', '', regex=True)
# Keep only the first character of what is left.
X.Cabin = X.Cabin.str[0]

In [None]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()

# Integer-encode the categorical columns in place.
# Any NaN still present in a column is turned into an explicit "missing" category
# first: depending on the scikit-learn version, NaN mixed with strings either raises
# a TypeError or gets encoded implicitly, so being explicit keeps the result stable.
# NOTE: the encoder is re-fitted for each column, so after the loop it only holds the
# classes of the last column ('Embarked').
for column in ['Sex','Cabin','Embarked']:
    X[column] = labelencoder.fit_transform(X[column].fillna("missing"))

X.head()

In [None]:
# Remove the identifier / free-text columns.
# The axis is selected with the `columns=` keyword: passing it positionally
# (`drop(labels, 1)`) raises a TypeError from pandas 2.0 onwards.
X = X.drop(columns=["PassengerId","Name","Ticket"])

# Feature Selection

In [None]:
# Separate the target column from the feature matrix.
y = X.Survived
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
X = X.drop(columns="Survived")

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every feature to the [0, 1] range, then rebuild a DataFrame so the
# column names survive the round-trip through NumPy.
# NOTE(review): the scaler is fitted on the full data set, before the train/test
# split further down, so test rows influence the scaling — confirm this is acceptable.
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit(X).transform(X)
X = pd.DataFrame(data=X_scaled, columns=X.columns)

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a plain logistic regression on all scaled features and plot its signed
# coefficients as a rough indicator of feature importance.
model = LogisticRegression(max_iter=10000).fit(X, y)
importance = model.coef_
feat_importances = pd.Series(np.ravel(importance), index=X.columns)
feat_importances.plot.barh()
plt.show()



In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
# Univariate F-test (f_classif) of every feature against the target.
# Only the per-feature scores are used below; the k=4 selection itself is not applied.
bestfeatures = SelectKBest(score_func=f_classif, k=4)
fit = bestfeatures.fit(X, y)
# Pair each feature name with its score in one two-column frame.
featureScores = pd.DataFrame({'Specs': X.columns, 'Score': fit.scores_})
print(featureScores.nlargest(8,'Score'))  # the (up to) 8 highest-scoring features

In [None]:
## check using p.values

from sklearn import datasets, linear_model
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from scipy import stats

# Add an explicit intercept column; sm.OLS does not add one on its own.
X2 = sm.add_constant(X)

# Fit on X2. Previously the model was fitted on X.values, so the constant built
# above was never used and the summary showed x1, x2, ... instead of feature names.
est = sm.OLS(y.values, X2)
est2 = est.fit()
print(est2.summary())

In [None]:
# Dropping unwanted features.
# `columns=` is used because the positional axis argument (`drop("SibSp", 1)`)
# is rejected with a TypeError by pandas 2.0 and later.

X = X.drop(columns="SibSp")

In [None]:
## Check for multicollinearity: annotated heatmap of the pairwise feature correlations.
c = X.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(c, annot=True, cmap="BrBG", ax=ax)

## As suspected, Cabin is correlated with Pclass, so we will try dropping it.

In [None]:
# Dropping unwanted features. 

#X.drop("Cabin",1,inplace=True)

# Keeping Cabin gives better accuracy, so the drop above stays commented out.

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed random_state keeps the split reproducible.
split = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split



# Train model

In [None]:
# Fit polynomial logistic regressions of degree 1..8 and record, per degree, the
# pipeline's own training score plus the accuracy on the train and test sets.
model_score, accuracy, accuracy_test = [], [], []
for i in range(1, 9):
    polyreg = make_pipeline(
        PolynomialFeatures(degree=i, include_bias=True),
        LogisticRegression(max_iter=1000),
    ).fit(X_train, y_train)
    train_pred = polyreg.predict(X_train)
    test_pred = polyreg.predict(X_test)
    model_score.append(polyreg.score(X_train, y_train))
    accuracy.append(accuracy_score(y_train, train_pred))
    accuracy_test.append(accuracy_score(y_test, test_pred))
    



# Results

In [None]:
# Report the three per-degree metric lists, one per line.
print(
    f'this is training model score: {model_score}',
    f'this is the accuracy on train set: {accuracy}',
    f'this is the accuracy on test set: {accuracy_test}',
    sep='\n',
)


Conclusion:
1. It's clear that as the degree of the polynomial increases, the model overfits the training set (high variance) and then performs worse on the test set.
2. The 3rd-degree polynomial logistic regression fits our data best, as it achieves the highest accuracy on the test set.



## Learning curves

In [None]:
from sklearn.model_selection import validation_curve, learning_curve

def draw_learning_curve(model, x, y, train_sizes=(50, 100, 300, 500, 600), cv=20):
    """Plot mean train and cross-validation scores against training-set size.

    Parameters
    ----------
    model : estimator
        Estimator or pipeline handed unchanged to ``learning_curve``.
    x, y : array-like
        Feature matrix and target handed unchanged to ``learning_curve``.
    train_sizes : sequence, default (50, 100, 300, 500, 600)
        Training-set sizes to evaluate. The default reproduces the values that
        were previously hard-coded; pass smaller sizes for smaller data sets.
    cv : int, default 20
        Cross-validation setting forwarded to ``learning_curve``. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    None
        The two curves, the legend and the axis labels are drawn with pyplot;
        the caller is expected to add a title and call ``plt.show()``.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        model, x, y, train_sizes=list(train_sizes), cv=cv
    )
    # Average over the CV folds (axis 1) to get one score per training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)

    plt.plot(train_sizes, train_scores_mean, color='blue', label='Train score')
    plt.plot(train_sizes, test_scores_mean, color='red', label='Cross-validation score')

    plt.legend(loc='best')
    plt.xlabel('Training size')
    plt.ylabel('score')



In [None]:
# One learning-curve figure per polynomial degree from 1 to 4.
for i in range(1, 5):
    steps = (
        PolynomialFeatures(degree=i, include_bias=True),
        LogisticRegression(max_iter=1000),
    )
    polyreg = make_pipeline(*steps)
    draw_learning_curve(polyreg, X_train, y_train)
    plt.title(f"Learning curve for {i}-degree poly Regressor")
    plt.show()

## Upvote if you like!! cheers!!!!!