In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib

import urllib.request # request library for downloading a url
import os.path

from sklearn.model_selection import train_test_split #Train-test data split


In [None]:
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10, 8) # set default figure size, 8in by 6in

# Example Download of UCI Data Set

Here is an example of downloading a file from an internet URL address, then loading it into
a pandas dataframe.

In [None]:
# create a report hook function, so that the urlretrieve() can display
# a status report while downloading
def urlretrieve_reporthook(block_number, read_size, total_file_size):
    """Print a one-line progress report for ``urllib.request.urlretrieve``.

    Parameters
    ----------
    block_number : int
        Count of blocks transferred so far.
    read_size : int
        Size of one block in bytes.
    total_file_size : int
        Total size of the file in bytes; not positive when the server did
        not report a size.

    A report is printed on every 100th block, and once more when the
    transfer reaches the total size so that the last line shows the finished
    download. The line starts with a carriage return and has no newline, so
    successive reports overwrite each other.
    """
    bytes_read = block_number * read_size
    size_known = total_file_size > 0
    finished = size_known and bytes_read >= total_file_size

    # throttle the output: only every 100th block, plus the final block
    if block_number % 100 != 0 and not finished:
        return

    if size_known:
        # the last block is usually only partly filled, so blocks * block size
        # can overshoot the real file size; never report more than the total
        bytes_read = min(bytes_read, total_file_size)
        print("\rReading %d / %d complete" % (bytes_read, total_file_size), end="")
    else:
        # no total available, report the running byte count only
        print("\rReading %d bytes complete" % bytes_read, end="")

In [None]:
# source URL of the cervical cancer risk factors csv on the UCI repository, and the
# local path the file is cached at
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00383/risk_factors_cervical_cancer.csv'
train_file = './data/risk_factors_cervical_cancer.csv'

# download the csv (comma separated values) file into our data folder
# always good to check and only download if we don't already have the file, so we can more easily
# rerun all cells without causing the download to be done every time
if not os.path.exists(train_file):
    # urlretrieve() does not create missing folders, so make sure the data folder exists
    os.makedirs(os.path.dirname(train_file), exist_ok=True)
    print('Beginning file download with urllib2...')
    # download under a temporary name and rename only on success; otherwise an
    # interrupted download would leave a truncated file that the exists() check
    # above treats as a complete one on the next run
    partial_file = train_file + '.part'
    urllib.request.urlretrieve(url, partial_file, reporthook=urlretrieve_reporthook)
    os.replace(partial_file, train_file)

# Introduction 

# Problem space
Here I am using the risk_factors_cervical_cancer data, which has 36 columns and 858 rows. There are 32 input features and 4 output features available. The goal is to find the relevant features and to create a model that fits the data well. I am going to try several different models, compare their accuracy, and then decide which one fits the data without noise or loss of data.

Using this cervical cancer dataset, I will first explore the data to identify the primary or required features, and then use them to create a model.
  
At first, I will do these tasks:

- Data loading
- Data cleaning
- Data preprocessing
- Data modeling

In [None]:
# load the csv file into a pandas dataframe
# no rows are skipped and no na_values are passed here, so the '?' placeholders that
# mark missing data are read as plain strings; they are replaced by NaN and the
# columns are converted to numeric types in the data cleaning step
cancerData = pd.read_csv(train_file)

# info() prints its report itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output
cancerData.info()

# 1. Data Cleaning
As a first step, we clean the data so that it makes more sense, so that the features can give valid accuracy and predict the data well.

# 1 (a) Finding missing values
My dataset has '?' in place of missing values. Here I replace the '?' entries with NA (missing values) and then fill in the missing values. The preparation can be done in these ways:

1. Convert categorical data to numerical data (cancerData doesn't have any categorical data, it's all numeric, so I can ignore this step).
2. Remove irrelevant columns (optional). "STDs: Time since first diagnosis" and "STDs: Time since last diagnosis" consist mostly of missing values and need not be relevant for my model, so I removed (dropped) these columns.

In [None]:
#dropping the columns below because these have almost 90% of missing data
cancerData = cancerData.drop(columns=['STDs: Time since first diagnosis', 'STDs: Time since last diagnosis'], errors='raise')

# the file marks missing entries with '?'; turn them into real NaN values
cancerData = cancerData.replace('?', np.nan)

#isnull().sum() shows what all columns has NA values and how many rows NA data exists.
print(cancerData.isnull().sum())

# DataFrame.convert_objects() no longer exists in current pandas; pd.to_numeric applied
# column by column is its replacement. errors='coerce' keeps the old
# convert_numeric=True behaviour of turning unconvertible values into NaN.
cancerData = cancerData.apply(pd.to_numeric, errors='coerce')

# 1 (b) Resolving Missing values
The missing values are replaced with 0, the mean() or the median(), whichever is suitable for the column.

1. The "Age" column doesn't have any missing data.
2. In "Number of sexual partners" and "First sexual intercourse" the NAs are replaced with mean() values, as these columns don't have outliers.
3. In "Num of pregnancies", "IUD" and "IUD (years)" the NAs are replaced with 0.
4. In all the remaining columns the NAs are replaced with median() values, as these columns may have many outliers.
       

In [None]:
# Imputation plan, one strategy per group of columns:
#   mean   -> columns without outliers
#   zero   -> columns where a missing entry is filled with 0
#   median -> all remaining columns with missing data, as they may contain outliers
mean_filled = ['Number of sexual partners', 'First sexual intercourse']
zero_filled = ['Num of pregnancies', 'IUD', 'IUD (years)']
median_filled = [
    'Smokes', 'Smokes (years)', 'Smokes (packs/year)',
    'Hormonal Contraceptives', 'Hormonal Contraceptives (years)',
    'STDs', 'STDs (number)',
    'STDs:condylomatosis', 'STDs:cervical condylomatosis',
    'STDs:vaginal condylomatosis', 'STDs:vulvo-perineal condylomatosis',
    'STDs:syphilis', 'STDs:pelvic inflammatory disease',
    'STDs:genital herpes', 'STDs:molluscum contagiosum',
    'STDs:AIDS', 'STDs:HIV', 'STDs:Hepatitis B', 'STDs:HPV',
]

# one replacement value per column; every column gets a statistic computed from its
# OWN values (previously the two condylomatosis columns were filled with the median
# of 'STDs:HPV')
fill_values = {column: 0 for column in zero_filled}
fill_values.update(cancerData[mean_filled].mean().to_dict())
fill_values.update(cancerData[median_filled].median().to_dict())

# a single DataFrame-level fillna instead of chained `df[col].fillna(..., inplace=True)`
# calls, which newer pandas versions warn about and which do not modify the frame
# when copy-on-write is enabled
cancerData = cancerData.fillna(value=fill_values)

In [None]:
# report the size of the dataframe: rows are samples, columns are features
num_samples = cancerData.shape[0]
num_features = cancerData.shape[1]
print("Number of features:", num_features)
print("number of training samples:", num_samples)

In [None]:
# list the datatype pandas assigned to each column
column_dtypes = cancerData.dtypes
print(column_dtypes)

# summary statistics per column; the count row shows how many values are
# present (not missing) for each feature
cancerData.describe()

In [None]:
# 'Biopsy' is the label/target we would want to use if we were to build a classifier.
# Biopsy has 0 or 1 values, so this can be treated as a binary classification.
# The data has 4 to 5 output variables that declare the cancer confirmation, but as of
# now only 'Biopsy' is used as output.
np.unique(cancerData['Biopsy'].values)

In [None]:
# separate the label from the inputs: y holds the 'Biopsy' labels, x holds every
# remaining column. Both are selected by column name, so x and y stay consistent
# with each other regardless of where 'Biopsy' sits in the frame (the previous
# positional slice only worked while it was the last column).
# NOTE(review): the other output variables of the dataset are still part of x -
# confirm that using them as input features is intended.
x = cancerData.drop(columns=['Biopsy'])
print(x.shape)
y = cancerData['Biopsy'].values
print(y.shape)

## Plot data
Using histogram and boxplot on age just for understanding

In [None]:
# box plots of the first column of the data (age) and of the Biopsy label, side by side
fig, ax1 = plt.subplots()
ax1.boxplot([cancerData.iloc[:, 0], y])
# name the two boxes; without this they are only numbered 1 and 2
ax1.set_xticklabels(['Age', 'Biopsy'])
ax1.set_title('Distribution of age and of the Biopsy label')
# show the figure explicitly so the cell does not echo the dict of artists
# returned by boxplot()
plt.show()

In [None]:
# draw one histogram per column of the dataframe
hist_axes = cancerData.hist()
plt.show()

## Test-Train data split
1. Split the data to train and test set with 80-20 ratio
2. plot the train data

In [None]:
# hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable between runs
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=19, test_size=0.20
)
for split_part in (x_test, x_train):
    print(split_part.shape)

## Data standardization
The cancer dataset is quite skewed, so I am trying to standardise it so that it is easier to estimate or compare.
I am standardising the whole data first to understand it; later I will be using the scaler on the train and test data separately.


In [None]:
from sklearn import preprocessing

# remember the column names of the dataframe
names = cancerData.columns

# standard scaler, fitted on the whole dataframe and then applied to it
std_scaler = preprocessing.StandardScaler()
std_scaler.fit(cancerData)
scaled_cancerData = std_scaler.transform(cancerData)

# shape of the scaled array
print(scaled_cancerData.shape)

## Modeling data
We could apply unsupervised learning algorithms to understand the sensitivity of the data. But here I am using ordinary regression models and classifiers and then comparing the score values of each.
I am applying regression and classification models to see which one is suitable for my data.

In [None]:
# Linear Regression
from sklearn.linear_model import LinearRegression

# fit() hands back the fitted estimator, so it can be chained onto the constructor
model_1 = LinearRegression().fit(x_train, y_train)

# fitted intercept first, then one coefficient per input feature
for fitted_param in (model_1.intercept_, model_1.coef_):
    print(fitted_param)

y_pred = model_1.predict(x_test)
# score of the model on the held-out test data
model_1.score(x_test, y_test)

In [None]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression

# lbfgs solver, allowed to run for up to 600 iterations
model_2 = LogisticRegression(max_iter=600, solver='lbfgs').fit(x_train, y_train)

# fitted intercept first, then the coefficients
for fitted_param in (model_2.intercept_, model_2.coef_):
    print(fitted_param)

# predicted class for every test sample
y_pred = model_2.predict(x_test)
print(y_pred)
# score of the model on the held-out test data
model_2.score(x_test, y_test)

In [None]:
#SVM model - For classification (Linear kernel)
from sklearn.preprocessing import PolynomialFeatures
from sklearn import svm

# support vector classifier with a linear kernel
linclf = svm.SVC(C=1e6, kernel='linear').fit(x_train, y_train)

# coefficients and intercept that were fitted to the data by the linear SVM
print(linclf.coef_)
print(linclf.intercept_)

# score on the held-out test data; the linear SVM and the logistic regression
# end up with the same value
linclf.score(x_test, y_test)


In [None]:
# Poly - kernel SVM
# swap the linear kernel for a polynomial one to see whether the score improves
linclf2 = svm.SVC(C=1e6, kernel='poly').fit(x_train, y_train)
print(linclf2.intercept_)

# score on the held-out test data
linclf2.score(x_test, y_test)

## Conclusion:
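After cleaning, risk_factors_cervical_cancer has 34 features with 858 samples. The output feature is "Biopsy", and there are many dependent variables for our output feature.

Score values of each model (`score()` reports R-squared for linear regression but mean accuracy for the classifiers, so the first number is not directly comparable with the other three):

- Linear Regression = 20.27%
- Logistic Regression = 94.18%
- Linear SVM = 94.18%
- Poly kernel SVM = 90.69%

So, we can clearly say that the data is suitable for classification or Logistic regression.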