## <font color='green'> Application of Support Vector Machine to Gene Expression Data (Khan.csv)

In [None]:
import os

import numpy as np
import pandas as pd
import math

# Directory that holds Khan.csv. Set the KHAN_DATA_DIR environment variable to
# point somewhere else; when it is unset the original author's local folder is
# used, so the notebook behaves exactly as before on that machine.
DATA_DIR = os.environ.get(
    'KHAN_DATA_DIR',
    '/Users/hj020/Desktop/2022/EconomicAnalytics-master/Python_/Data')
os.chdir(DATA_DIR)

# 83 tissue samples are classified into four cancer types based on 2308 gene expression measurements
raw0 = pd.read_csv('Khan.csv')

# Quick sanity check: first rows and overall dimensions.
print(raw0.head())
print(raw0.shape) # high-dimensional data (large # of features)

### <font color='green'> Select a kernel function and tune the penalty parameter "C" using CV

#### <font color='green'> i) Data Preparation

In [None]:
# Features are every column from position 2 onward; the label is column 1.
X = raw0.iloc[:,2:]
Y = raw0.iloc[:,1]

import warnings
warnings.filterwarnings("ignore") # suppress warnings

from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. The seed is fixed (same value the
# newsgroups loader uses further down) so that re-running the notebook yields
# the same split and therefore the same grid-search results.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

#### <font color='green'> ii) Select a kernel function and tune the penalty parameter "C" using "GridSearchCV"
* SVC: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
* Available kernel functions: https://scikit-learn.org/stable/modules/svm.html#svm-kernels
* Precision & Recall: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html#sklearn.metrics.precision_recall_fscore_support

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC # classifier flavour of SVM (SVR is the regression counterpart)

# Penalty values searched for every kernel.
penalty_grid = [1, 10, 100, 1000]

# One sub-grid per kernel; kernel-specific parameters appear only where they apply.
tuned_parameters = [
    {'kernel': ['rbf'], 'C': penalty_grid},
    {'kernel': ['linear'], 'C': penalty_grid},
    {'kernel': ['poly'], 'degree': [5, 10, 15], 'C': penalty_grid},
    {'kernel': ['sigmoid'], 'coef0': [0, 1, 2], 'C': penalty_grid},
]

# Each metric gets its own full search (macro-averaged across classes).
scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # 5-fold cross-validation on the training split only.
    clf = GridSearchCV(SVC(), tuned_parameters, cv=5, scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on train set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on train set:")
    print()
    # Report mean CV score and a two-standard-deviation band for every candidate.
    cv_results = clf.cv_results_
    for idx, params in enumerate(cv_results['params']):
        mean = cv_results['mean_test_score'][idx]
        spread = cv_results['std_test_score'][idx] * 2
        print("%0.3f (+/-%0.03f) for %r" % (mean, spread, params))
    print()

    print("Detailed classification report:")
    print()
    print("The scores are computed on test set.")
    print()
    # The refitted best estimator is evaluated on the held-out test split.
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))
    print()

## <font color='green'> Classification of Web Documents Using Naive Bayes
* https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

#### <font color='green'> i) Import raw data (texts and their categories)
* 20 Newsgroups dataset:
    - http://qwone.com/~jason/20Newsgroups/
    - https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html#sklearn.datasets.fetch_20newsgroups

In [None]:
from sklearn.datasets import fetch_20newsgroups
# note: fetch_20newsgroups is a loader function, not a dataset object

# The full corpus has 20 categories; only these four are used here.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

# Strip everything that is not the main body of a post.
remove = ('headers', 'footers', 'quotes')

# Options shared by the train and test loads.
common_opts = dict(categories=categories, remove=remove, shuffle=True, random_state=42)

data_train = fetch_20newsgroups(subset='train', **common_opts)
data_test = fetch_20newsgroups(subset='test', **common_opts)

# Pull labels (Y) and raw documents (X) out of the loaded datasets.
Y_train, Y_test = data_train.target, data_test.target
X_train, X_test = data_train.data, data_test.data

In [None]:
# Check how each category is indexed: list the category names of the loaded
# training set so they can be matched against the integer labels in Y_train.
data_train.target_names

In [None]:
print(X_train[0]) # raw text of the first training document
print(Y_train) # integer category labels (0-3, one per training document)

#### <font color='green'> ii) Convert texts (bags of words) to numerical vectors
* TfidfVectorizer 
    - https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer
    
* Alternatively,
    - CountVectorizer: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer
    - HashingVectorizer: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.HashingVectorizer.html#sklearn.feature_extraction.text.HashingVectorizer
    
* Stop words: https://scikit-learn.org/stable/modules/feature_extraction.html#stop-words

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer 

Vectorizer=TfidfVectorizer(stop_words='english')

# Vectorize straight from the raw documents stored on data_train / data_test
# instead of from X_train / X_test. On a fresh run these are the same objects,
# but reading from the datasets keeps this cell idempotent: re-running it no
# longer feeds the already-vectorized matrices back into the vectorizer.
X_train = Vectorizer.fit_transform(data_train.data)
X_test = Vectorizer.transform(data_test.data)

# !!!Caution: Use ".fit_transform()" for training data, but use ".transform()" for testing data
# The vocabulary is learned from the training documents only and then reused to
# convert the test documents, so both matrices have the same columns (features).

# check the size of X_train and X_test
print(X_train.shape)
print(X_test.shape)

In [None]:
# Show the first row of the vectorized training matrix.
# NOTE(review): the vectorizer output is presumably a SciPy sparse matrix, in
# which case only the non-zero (row, column) weights are listed — confirm.
print(X_train[0,:])

#### <font color='green'> iii) Run NB
* https://scikit-learn.org/stable/modules/naive_bayes.html
* https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB

In [None]:
from sklearn.naive_bayes import MultinomialNB as NB
from sklearn.metrics import classification_report

# alpha is the additive smoothing strength applied to the per-class term counts.
NBres = NB(alpha=0.01)
NBres.fit(X_train, Y_train)

# Overall accuracy first, then the per-class precision/recall breakdown.
print(NBres.score(X_test, Y_test))
nb_pred = NBres.predict(X_test)
print(classification_report(Y_test, nb_pred))

In [None]:
# Fit a linear-kernel SVC on the same vectorized data for comparison
from sklearn.svm import SVC

SVCres = SVC(kernel='linear', C=10)
SVCres.fit(X_train, Y_train)

# Overall accuracy first, then the per-class precision/recall breakdown.
print(SVCres.score(X_test, Y_test))
svc_pred = SVCres.predict(X_test)
print(classification_report(Y_test, svc_pred))

### <font color='darkred'> HW7: Similarly to HW6-2, optimize SVC and NB on the newsgroups data

* Select ten categories and import raw data under your categories. Follow the steps above to prepare datasets to run SVC and NB
* Use the function "GridSearchCV" to optimize SVC and NB
    - To optimize SVC, select a kernel function and tune "C" parameter
    - To optimize NB, tune "alpha" parameter
* Use both "precision" and "recall" to evaluate prediction performance