# Proyecto 

### 1. Perfilación y preparación

En las siguientes líneas de código se importan las librerías y herramientas necesarias para desarrollar el caso de uso.

In [3]:
# Library for handling the contractions that occur in English.
!pip install contractions



In [4]:
# Library for handling grammatical inflections in English.
!pip install inflect
# Pinned pandas-profiling release; ProfileReport is imported from it further down.
!pip install pandas-profiling==2.7.1



In [5]:
# Natural Language Toolkit (NLTK), a library used for working with text.
import nltk
# Punkt makes it possible to split a text into sentences.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Download the stop words, i.e. the words that add nothing to the meaning of a text.
# Which words are those stop words?

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# Download the WordNet package used by WordNetLemmatizer to find the lemma of each word.
# What is the lemma of a word? How hard can it be to obtain - imagine having to write the function that does this yourself.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# Library imports
import pandas as pd
import numpy as np
import sys
import seaborn as sns
from pandas_profiling import ProfileReport

import re, string, unicodedata
import contractions
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from pandas.core.dtypes.generic import ABCIndex
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report, confusion_matrix, plot_precision_recall_curve
from sklearn.base import BaseEstimator, ClassifierMixin
from statistics import mode
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import validation_curve
# Para búsqueda de hiperparámetros
from sklearn.model_selection import GridSearchCV
# Para la validación cruzada
from sklearn.model_selection import KFold 
# Para usar KNN como clasificador
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

Carga de los datos

In [9]:
#from google.colab import drive
#drive.mount('/content/drive')

In [10]:
# Se cargan los datos. 
#data=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Proyecto/clinical_trials_on_cancer_data_clasificacion.csv', sep=',', encoding = 'utf-8')

In [11]:
# Load the data from the CSV file, read through a path relative to the working directory.
data=pd.read_csv('clinical_trials_on_cancer_data_clasificacion.csv', sep=',', encoding = 'utf-8', index_col=None, low_memory=False)

In [12]:
# Number of records and number of variables
data.shape

(12000, 2)

In [13]:
# Show the first rows of the data
data.head()

Unnamed: 0,label,study_and_condition
0,__label__0,study interventions are Saracatinib . recurren...
1,__label__1,study interventions are Stem cell transplantat...
2,__label__0,study interventions are Lenograstim . recurren...
3,__label__0,study interventions are Doxorubicin . stage ii...
4,__label__1,study interventions are Poly I-C . prostate ca...


In [14]:
# It is recommended to perform every preparation step on a separate copy.
# An explicit .copy() is needed: plain assignment only binds a second name to
# the same DataFrame, so any in-place change would also alter `data`.
data_t = data.copy()

In [15]:
# Summary of the columns: names, non-null counts and dtypes.
data_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   label                12000 non-null  object
 1   study_and_condition  12000 non-null  object
dtypes: object(2)
memory usage: 187.6+ KB


In [16]:
# Number of records per value of the 'label' column.
data_t['label'].value_counts()

__label__0    6000
__label__1    6000
Name: label, dtype: int64

### Limpieza de los datos
La limpieza deja los textos en texto plano, lo cual es especialmente útil cuando provienen de diferentes fuentes como HTML, Twitter o XML, entre otras. También se eliminan los caracteres especiales y se pasa todo a minúscula.

In [17]:
def remove_non_ascii(words):
    """Strip non-ASCII characters from a list of tokenized words.

    Each word is NFKD-normalized, which decomposes accented letters into a
    base letter plus combining marks; encoding to ASCII with ``'ignore'``
    then discards every character that has no ASCII form.

    Args:
        words: iterable of ``str`` tokens.

    Returns:
        A new list with the ASCII-only form of each token. Tokens that end up
        empty (they consisted solely of non-ASCII characters) are omitted, the
        same way ``remove_punctuation`` omits tokens it empties, so no ``''``
        entries leak into later steps.
    """
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        # Skip tokens that were made only of non-ASCII characters.
        if new_word != '':
            new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Return a new list holding the lowercase form of every token in ``words``."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Delete punctuation from each token and drop tokens that become empty.

    Every character that is neither a word character nor whitespace is
    removed; tokens consisting only of such characters disappear from the
    returned list.
    """
    stripped_tokens = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped_tokens if token != '']

def replace_numbers(words):
    """Spell out every all-digit token; leave all other tokens untouched.

    A token counts as a number when ``str.isdigit()`` is true for it; such
    tokens are converted with ``inflect``'s ``number_to_words``.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    Args:
        words: iterable of ``str`` tokens. Matching is exact, so tokens are
            compared as given (no case folding happens here).

    Returns:
        A new list with the tokens that are not in NLTK's English stop-word
        list, in their original order.
    """
    # A set gives constant-time membership checks; the stop-word list would
    # otherwise be scanned linearly for every single token.
    stop = set(stopwords.words('english'))
    return [word for word in words if word not in stop]

def preprocessing(words):
    """Run the cleaning steps over a token list and return the cleaned tokens.

    Steps, in order: lowercase, spell out numbers, strip punctuation, strip
    non-ASCII characters, remove stop words.
    """
    cleaning_steps = (
        to_lowercase,
        replace_numbers,
        remove_punctuation,
        remove_non_ascii,
        remove_stopwords,
    )
    for step in cleaning_steps:
        words = step(words)
    return words

In [18]:
# Drop the records with missing values, then drop the duplicated records.
data_t = data_t.dropna().drop_duplicates()
data_t['label'].value_counts()

__label__1    5996
__label__0    5992
Name: label, dtype: int64

### Tokenización
La tokenización permite dividir frases u oraciones en palabras, con el fin de desglosar las palabras correctamente para el posterior análisis. Pero primero, se realiza una corrección de las contracciones que pueden estar presentes en los textos. 

In [19]:
data_t['study_and_condition'] = data_t['study_and_condition'].apply(contractions.fix) # Apply the contraction fix to every text

In [20]:
data_t['words'] = data_t['study_and_condition'].apply(word_tokenize).apply(preprocessing) # Tokenize each text, then apply the noise removal
data_t.head()

Unnamed: 0,label,study_and_condition,words
0,__label__0,study interventions are Saracatinib . recurren...,"[study, interventions, saracatinib, recurrent,..."
1,__label__1,study interventions are Stem cell transplantat...,"[study, interventions, stem, cell, transplanta..."
2,__label__0,study interventions are Lenograstim . recurren...,"[study, interventions, lenograstim, recurrent,..."
3,__label__0,study interventions are Doxorubicin . stage ii...,"[study, interventions, doxorubicin, stage, iii..."
4,__label__1,study interventions are Poly I-C . prostate ca...,"[study, interventions, poly, ic, prostate, can..."


Eliminar palabras repetidas en todos los registros (study, interventions)

In [21]:
# Tokens this notebook treats as repeated in every record.
BOILERPLATE_TOKENS = ('study', 'interventions')


def _drop_boilerplate(tokens, unwanted=BOILERPLATE_TOKENS):
    """Return a copy of ``tokens`` without the first occurrence of each unwanted token.

    Args:
        tokens: list of ``str`` tokens for one record.
        unwanted: tokens to remove; only the first occurrence of each is
            dropped, later occurrences are kept.

    Returns:
        A new list; ``tokens`` itself is not modified. A token that is absent
        is simply skipped instead of raising ``ValueError``.
    """
    remaining = list(tokens)
    for token in unwanted:
        if token in remaining:
            remaining.remove(token)
    return remaining


# The previous loop assigned the result of list.remove() (always None), only
# worked through in-place mutation of the stored lists, and called
# Series.replace(None) once per row. A single apply does the same removal.
data_t['words'] = data_t['words'].apply(_drop_boilerplate)
data_t.head()

Unnamed: 0,label,study_and_condition,words
0,__label__0,study interventions are Saracatinib . recurren...,"[saracatinib, recurrent, verrucous, carcinoma,..."
1,__label__1,study interventions are Stem cell transplantat...,"[stem, cell, transplantation, hodgkin, lymphom..."
2,__label__0,study interventions are Lenograstim . recurren...,"[lenograstim, recurrent, adult, diffuse, mixed..."
3,__label__0,study interventions are Doxorubicin . stage ii...,"[doxorubicin, stage, iii, diffuse, large, cell..."
4,__label__1,study interventions are Poly I-C . prostate ca...,"[poly, ic, prostate, cancer, diagnosis, unreso..."


### Normalización
En la normalización de los datos se realiza la eliminación de prefijos y sufijos, además de una lematización.

In [22]:
# Module-level lemmatizer instances and the English stop-word list.
# NOTE(review): none of these three names is used by the functions defined in
# this cell - confirm whether any other cell still needs them.
lemmatizer = nltk.stem.WordNetLemmatizer()
wordnet_lemmatizer = WordNetLemmatizer()
stop = stopwords.words('english')

def nltk_tag_to_wordnet_tag(nltk_tag):
    """Map an NLTK part-of-speech tag onto the matching WordNet constant.

    The decision depends only on how the tag starts: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag gives
    ``None``.
    """
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if nltk_tag.startswith(prefix):
            # Look the constant up only for the branch that matches.
            return getattr(wordnet, attribute)
    return None

def stem_words(words):
    """Stem every token in a list of tokenized words with the Porter stemmer.

    Args:
        words: iterable of ``str`` tokens.

    Returns:
        A new list with the stem of each token, in the same order.
    """
    # https://www.datacamp.com/community/tutorials/stemming-lemmatization-python
    # Only the Porter stemmer is applied; the Lancaster stemmer that used to be
    # instantiated here was never used and has been removed.
    porter = PorterStemmer()
    return [porter.stem(word) for word in words]
        

def lemmatize_verbs(words, pos='n'):
    """Lemmatize every token in a list of tokenized words with WordNet.

    Despite the function's name, tokens have always been lemmatized with the
    lemmatizer's default part of speech, which is noun. That default is now
    explicit and kept so existing results do not change; pass ``pos='v'`` to
    actually lemmatize tokens as verbs.

    Args:
        words: iterable of ``str`` tokens.
        pos: WordNet part-of-speech tag handed to ``WordNetLemmatizer.lemmatize``.

    Returns:
        A new list with the lemma of each token, in the same order.
    """
    #https://www.geeksforgeeks.org/python-lemmatization-approaches-with-examples/
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(word, pos=pos) for word in words]


def stem_and_lemmatize(words):
    """Return the stems of ``words`` followed by their lemmas in a single list."""
    combined = stem_words(words)
    combined = combined + lemmatize_verbs(words)
    return combined


data_t['words'] = data_t['words'].apply(stem_and_lemmatize) # Apply lemmatization and the removal of prefixes and suffixes (stemming).
data_t.head()

Unnamed: 0,label,study_and_condition,words
0,__label__0,study interventions are Saracatinib . recurren...,"[saracatinib, recurr, verruc, carcinoma, laryn..."
1,__label__1,study interventions are Stem cell transplantat...,"[stem, cell, transplant, hodgkin, lymphoma, di..."
2,__label__0,study interventions are Lenograstim . recurren...,"[lenograstim, recurr, adult, diffus, mix, cell..."
3,__label__0,study interventions are Doxorubicin . stage ii...,"[doxorubicin, stage, iii, diffus, larg, cell, ..."
4,__label__1,study interventions are Poly I-C . prostate ca...,"[poli, ic, prostat, cancer, diagnosi, unresolv..."


###  Selección de campos

Primero, se separa la variable predictora y los textos que se van a utilizar.

In [23]:
# Join each record's tokens back into one space-separated string.
data_t['words'] = data_t['words'].apply(
    lambda tokens: ' '.join(str(token) for token in tokens)
)
data_t

Unnamed: 0,label,study_and_condition,words
0,__label__0,study interventions are Saracatinib . recurren...,saracatinib recurr verruc carcinoma larynx dia...
1,__label__1,study interventions are Stem cell transplantat...,stem cell transplant hodgkin lymphoma diagnosi...
2,__label__0,study interventions are Lenograstim . recurren...,lenograstim recurr adult diffus mix cell lymph...
3,__label__0,study interventions are Doxorubicin . stage ii...,doxorubicin stage iii diffus larg cell lymphom...
4,__label__1,study interventions are Poly I-C . prostate ca...,poli ic prostat cancer diagnosi unresolv ira f...
...,...,...,...
11995,__label__0,study interventions are Prednisolone hemisucci...,prednisolon hemisuccin recurr childhood larg c...
11996,__label__0,study interventions are Bevacizumab . recurren...,bevacizumab recurr rectal cancer diagnosi abso...
11997,__label__1,"study interventions are Antibodies, Monoclonal...",antibodi monoclon recurr lymphoblast lymphoma ...
11998,__label__0,study interventions are Vorinostat . colorecta...,vorinostat colorect cancer diagnosi patient mu...


In [24]:
# Encode the string labels as integers: '__label__1' -> 1 and '__label__0' -> 0.
data_t['label'] = data_t['label'].replace(['__label__1'],1)
data_t['label'] = data_t['label'].replace(['__label__0'],0)


Aplicamos TF-IDF (Term Frequency – Inverse Document Frequency) a los datos





In [25]:
# Source: https://towardsdatascience.com/natural-language-processing-feature-engineering-using-tf-idf-e8b9d00e7e76
# NOTE(review): the vectorizer is fitted on every document before the
# train/test split done later in the notebook, so the IDF weights also see the
# test documents - consider fitting on the training split only.
vectorizer = TfidfVectorizer()
allDocs = data_t['words'].tolist()
vectors = vectorizer.fit_transform(allDocs)
# get_feature_names() is deprecated since scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back otherwise.
_get_feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
feature_names = list(_get_feature_names())
# Densify once into a NumPy array. The earlier todense().tolist() round trip
# built a Python list of lists holding one float object per matrix cell before
# pandas converted it back to an array.
dense = vectors.toarray()
data_tfidf = pd.DataFrame(dense, columns=feature_names)
data_tfidf.head()

Unnamed: 0,01,01910na,025,05,09,0three_two9,0two_two009,10deazaaminopterin,11,12,...,zivaflibercept,zk,zoladex,zoledron,zoledronate,zoledronic,zolmitriptan,zometa,zone,zubrod
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Modelado con Support Vector Machines (Simón Guzmán L)

In [29]:
# Select the target variable, in this case "label".
Y = data_t['label']
# Use the values TF-IDF was applied to as the model inputs.
X = data_tfidf

In [30]:
# Hold out 20% of the records for testing. A fixed random_state makes the
# split, and therefore every metric computed from it, reproducible across
# kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 42)

In [32]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the RBF-kernel SVM: 5 values of C x 5 values of
# gamma x 1 kernel = 25 candidates.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

# verbose=3 logs the score and timing of every individual fit.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True, verbose=3)
 


In [33]:
# Fit the grid search on the training split (5 folds x 25 candidates = 125 fits, as the log reports).
grid.fit(X_train, Y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.743 total time= 7.3min
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.752 total time= 7.6min
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.746 total time= 7.9min
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.751 total time= 7.8min
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.768 total time= 7.7min
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.674 total time= 8.3min
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.670 total time= 8.2min
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.672 total time= 7.8min
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.670 total time= 7.4min
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.677 total time= 7.7min
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.502 total time= 7.7min
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             verbose=3)

In [34]:
# Print the best parameter combination found by the search
print(grid.best_params_)
 
# Print the estimator configured with those parameters
print(grid.best_estimator_)

{'C': 100, 'gamma': 1, 'kernel': 'rbf'}
SVC(C=100, gamma=1)


In [36]:
# Predict the labels of the held-out test split with the fitted grid search.
grid_predictions = grid.predict(X_test)
 
# Print per-class precision, recall and f1-score against the true test labels
print(classification_report(Y_test, grid_predictions))

              precision    recall  f1-score   support

           0       0.83      0.81      0.82      1219
           1       0.81      0.83      0.82      1179

    accuracy                           0.82      2398
   macro avg       0.82      0.82      0.82      2398
weighted avg       0.82      0.82      0.82      2398

