# Advanced Topics in Data Science (CS5661). Cal State Univ. LA, CS Dept.
### Dr. Mohammad Pourhomayoun
----------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------


# Data Science in Python

#### This is a review of data science libraries/packages in Python. Feel free to refer to the suggested resources and documentation for more details.

---------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------


# Visualization and Model Evaluation 


###    Review: LOGISTIC REGRESSION Classifier:
#### Importing the sklearn class (machine learning algorithm) that you would like to use for modeling:

In [None]:
# Importing libraries and packages:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#### Set up the Feature Matrix and Label Vector:

In [None]:
# "read_csv" is a pandas function to read csv files from web or local device:

# NOTE(review): this is an absolute, machine-specific path; point it at your own copy of Cancer.csv.
cancer_df = pd.read_csv('/Users/mpourho/Documents/CSU/Courses/CS5661/Datasets/Cancer.csv')

# Previewing the dataset: head() with no argument displays the first 5 rows.
cancer_df.head()

In [None]:
# Names of the nine columns that will be used as model inputs (features):
feature_cols = [
    'Clump_Thickness',
    'Uniformity_of_Cell_Size',
    'Uniformity_of_Cell_Shape',
    'Marginal_Adhesion',
    'Single_Epithelial_Cell_Size',
    'Bare_Nuclei',
    'Bland_Chromatin',
    'Normal_Nucleoli',
    'Mitoses',
]

# Label vector: the 'Malignant_Cancer' column of the original DataFrame
y = cancer_df['Malignant_Cancer']

# Feature matrix: the original DataFrame restricted to the columns listed above
X = cancer_df[feature_cols]

# Show the first 5 rows of the features, then of the labels
print(X.head())
print(y.head())

#### Splitting the Dataset:

In [None]:
# Random 80/20 split of the dataset into a training set and a testing set
# (random_state is fixed so the same split is produced on every run):
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=2, test_size=0.2
)

# Size of the training set (features, then labels):
print(X_train.shape)
print(y_train.shape)

# Size of the testing set (features, then labels):
print(X_test.shape)
print(y_test.shape)

## Logistic Regression Classifier
#### Defining (instantiating) an "object" from the sklearn class "LogisticRegression":

In [None]:
# "my_logreg" is instantiated as an "object" of LogisticRegression "class". 
# No arguments are passed, so the classifier uses scikit-learn's default settings.
my_logreg = LogisticRegression()


#### Training Stage: Training a predictive model using the training dataset:


In [None]:
# Training ONLY on the training set (the testing set is kept aside for evaluation):
my_logreg.fit(X_train, y_train)


#### Testing (Prediction) Stage: Making prediction on new observations (Testing Data) using the trained model:


In [None]:
# Testing on the testing set: predict a label for every row of X_test,
# then print the array of predicted labels:
y_predict_lr = my_logreg.predict(X_test)
print(y_predict_lr)

### Accuracy Evaluation:


In [None]:
# Accuracy: compare the "predicted labels" for the Testing Set against its
# "actual labels" and report the fraction that match.
score_lr = accuracy_score(y_true=y_test, y_pred=y_predict_lr)
print(score_lr)


## Estimating the Probability (Likelihood) of an Event Happening:
##### As we mentioned before, although Logistic Regression is a classifier, it can also estimate the probability of the event happening (i.e., the likelihood of each label) rather than just providing a binary prediction (see the Logistic Regression lecture from CS4661 for more information).
##### Thus, we can adjust the True Positive Rate (TPR = Sensitivity) and False Positive Rate (FPR = False Alarm Rate) by changing the decision Threshold (see Model Evaluation lecture from CS4661 for more information).

In [None]:
# Predicting the Binary Label (the same call as in the Testing Stage cell):
y_predict_lr = my_logreg.predict(X_test)

# Estimating the probability (likelihood) of Each Label;
# column index 1 is used in the following cells as the likelihood of label=1:
y_predict_prob_lr = my_logreg.predict_proba(X_test)


In [None]:
# Uncomment to print the "actual labels" of the testing set:
#print(y_test)

# Prints the "predicted labels" for the testing set:
print(y_predict_lr)

# Uncomment to print the "estimated likelihood of both labels" for the testing set:
#print(y_predict_prob_lr)

# Prints the "estimated likelihood of label=1" for the testing set
# (column index 1 of the predict_proba output):
print(y_predict_prob_lr[:,1])

# True Positive Rate (TPR) and False Positive Rate (FPR):

In [None]:
# "metrics" is already imported in the first code cell; the import is repeated here.
from sklearn import metrics

# roc_curve takes the actual labels and the estimated likelihood of label=1, and returns
# the False Positive Rate and True Positive Rate at each decision threshold in "thresholds":
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_predict_prob_lr[:,1], pos_label=1)

# Print both rate arrays, separated by a few blank lines:
print(fpr)
print("\n\n\n")
print(tpr)

# AUC:

In [None]:
# AUC: the area under the ROC curve, computed from the (fpr, tpr) points:
AUC = metrics.auc(fpr, tpr)
print(AUC)

# ROC Curve:

In [None]:
# Importing the "pyplot" package of "matplotlib" library of python to generate 
# graphs and plot curves:
import matplotlib.pyplot as plt

# The following line will tell Jupyter Notebook to keep the figures inside the explorer page 
# rather than openng a new figure window:
%matplotlib inline

plt.figure()

# Roc Curve:
plt.plot(fpr, tpr, color='red', lw=2, 
         label='ROC Curve (area = %0.2f)' % AUC)

# Random Guess line:
plt.plot([0, 1], [0, 1], color='blue', lw=1, linestyle='--')

# Defining The Range of X-Axis and Y-Axis:
plt.xlim([-0.005, 1.005])
plt.ylim([0.0, 1.01])

# Labels, Title, Legend:
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")

plt.show()

### Repeating the process with a smaller number of features:
##### This time using only 2 features:

In [None]:
# Restrict the model inputs to just two of the feature columns:
feature_cols2 = ['Clump_Thickness', 'Mitoses']

# Feature matrix (two columns) and label vector, both taken from the original DataFrame
X2 = cancer_df[feature_cols2]
y2 = cancer_df['Malignant_Cancer']

# Split with the same test_size and random_state as the full-feature model
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2, y2, random_state=2, test_size=0.2
)

# Instantiate and train in one step (fit() returns the estimator itself)
my_logreg2 = LogisticRegression().fit(X_train2, y_train2)

# Binary label predictions for the testing set
y_predict_lr2 = my_logreg2.predict(X_test2)

# Estimated probability (likelihood) of each label for the testing set
y_predict_prob_lr2 = my_logreg2.predict_proba(X_test2)

# ROC points from the likelihood of label=1, then the area under that curve
fpr2, tpr2, thresholds2 = metrics.roc_curve(
    y_test2, y_predict_prob_lr2[:, 1], pos_label=1
)
AUC2 = metrics.auc(fpr2, tpr2)
print(AUC2)

### Comparing the ROC Curves:

In [None]:
# The following line will tell Jupyter Notebook to keep the figures inside the explorer page 
%matplotlib inline

plt.figure()

# Roc Curve1:
plt.plot(fpr, tpr, color='red', lw=2, 
         label='ROC Curve (area = %0.2f)' % AUC)

# Roc Curve2:
plt.plot(fpr2, tpr2, color='green', lw=2, 
         label='ROC Curve (area = %0.2f)' % AUC2)

# Random Guess line:
plt.plot([0, 1], [0, 1], color='blue', lw=1, linestyle='--')

# Defining The Range of X-Axis and Y-Axis:
plt.xlim([-0.005, 1.005])
plt.ylim([0.0, 1.01])

# Labels, Title, Legend:
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")

plt.show()