In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest

import string

import warnings
warnings.filterwarnings("ignore")

import xml.etree.ElementTree as ET
import seaborn as sns

from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# Competición kaggle

# https://www.kaggle.com/c/the-bridge-nlp/data?select=train.csv

## Carga y limpieza datos:

In [2]:
df_train = pd.read_csv('data/train.csv')

In [3]:
df_train = df_train[['target', 'text']]

In [4]:
stop_words = stopwords.words("english")

def text_preproc(x):
    x = x.lower()
    x = ' '.join([word for word in x.split(' ') if word not in stop_words])
    x = x.encode('ascii', 'ignore').decode()
    x = re.sub(r'https*\S+', ' ', x)
    x = re.sub(r'http*\S+', ' ', x)
    x = re.sub(r'@\S+', ' ', x)
    x = re.sub(r'#\S+', ' ', x)
    x = re.sub(r'\'\w+', '', x)
    x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
    x = re.sub(r'\w*\d+\w*', '', x)
    x = re.sub(r'\s{2,}', ' ', x)
    return x

df_train['text'] = df_train['text'].apply(text_preproc)

In [233]:
#for i in range(0, 200):
#    print(i, df_train['text'][i])

0 deeds reason may allah forgive us
1 forest fire near la ronge sask canada
2 residents asked place notified officers evacuation shelter place orders expected
3  update california hwy closed directions due lake county fire 
4 i top hill see fire woods 
5 three people died heat wave far
6  bago myanmar arrived bago
7 damage school bus multi car crash 
8 what man 
9 love fruits
10 summer lovely
11 car fast
12 ridiculous 
13 london cool 
14 love skiing
15 wonderful day 
16 nyc last week 
17 love girlfriend
18 like pasta 
19 always try bring heavy 
20  breaking news nigeria flag set ablaze aba 
21 plus side look sky last night ablaze 
22  they built much hype around new acquisitions doubt set epl ablaze season 
23 inec office abia set ablaze 
24 barbados jamaica two cars set ablaze santa cruz head st elizabeth police superintende 
25 ablaze lord d
26 check out 
27 outside ablaze alive
but dead inside
28 awesome time visiting cfc head office ancop site ablaze thanks tita vida taking care us

## Entrenamiento Lemmatizer 


In [5]:
X = df_train[['text']]
y = df_train[['target']]

In [6]:
from nltk.stem import WordNetLemmatizer

def english_lemmatizer(x):
    lemmer = WordNetLemmatizer()
    return ' '.join([lemmer.lemmatize(word) for word in x.split()])

X['text'] = X['text'].apply(english_lemmatizer)

In [7]:
#Primera opción: Usando CountVectorizer y GridSearchCV
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('svm', SVC(probability=True))
])

parameters = {
    'vect__max_df': ([2.3]),
    'vect__min_df': ([15]),
    'vect__max_features': ([1500]),
    'vect__ngram_range': ((1, 2), (1, 3)),
    'svm__kernel': ('linear', 'rbf', 'sigmoid'),
    'svm__C': (0.5, 1, 2),
    'svm__gamma': ('scale', 'auto')
}

grid_search = GridSearchCV(pipeline,
                          parameters,
                          cv=3,
                          n_jobs=-1,
                          scoring='roc_auc') #Nos saca línea bajo al curva

In [8]:
#Segunda opción: Usando TfidfVectorizer y RandomizedSearchCV (la mejor combinación)
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('svm', SVC(probability=True))
])

parameters = {
    'vect__max_df': ([2.3]),
    'vect__min_df': ([15]),
    'vect__max_features': ([1500]),
    'vect__ngram_range': ((1, 2), (1, 3)),
    'svm__kernel': ('linear', 'rbf', 'sigmoid'),
    'svm__C': (0.5, 1, 2),
    'svm__gamma': ('scale', 'auto')
}

grid_search = RandomizedSearchCV(pipeline,
                          parameters,
                          cv=3,
                          n_jobs=-1,
                          scoring='roc_auc') #Nos saca la línea bajo la curva

In [9]:
grid_search.fit(X['text'], y['target'])

RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('vect', TfidfVectorizer()),
                                             ('svm', SVC(probability=True))]),
                   n_jobs=-1,
                   param_distributions={'svm__C': (0.5, 1, 2),
                                        'svm__gamma': ('scale', 'auto'),
                                        'svm__kernel': ('linear', 'rbf',
                                                        'sigmoid'),
                                        'vect__max_df': [2.3],
                                        'vect__max_features': [1500],
                                        'vect__min_df': [15],
                                        'vect__ngram_range': ((1, 2), (1, 3))},
                   scoring='roc_auc')

In [10]:
print("Best params:", grid_search.best_params_)
print()
print("Best auc:", grid_search.best_score_)
print()
print("Best model:", grid_search.best_estimator_)

Best params: {'vect__ngram_range': (1, 3), 'vect__min_df': 15, 'vect__max_features': 1500, 'vect__max_df': 2.3, 'svm__kernel': 'sigmoid', 'svm__gamma': 'scale', 'svm__C': 0.5}

Best auc: 0.7263076416991963

Best model: Pipeline(steps=[('vect',
                 TfidfVectorizer(max_df=2.3, max_features=1500, min_df=15,
                                 ngram_range=(1, 3))),
                ('svm', SVC(C=0.5, kernel='sigmoid', probability=True))])


## CONVERT DATA TEST

In [11]:
df_test = pd.read_csv('data/test.csv')
df_test = df_test[['text']]

In [12]:
stop_words = stopwords.words("english")

def text_preproc(x):
    x = x.lower()
    x = ' '.join([word for word in x.split(' ') if word not in stop_words])
    x = x.encode('ascii', 'ignore').decode()
    x = re.sub(r'https*\S+', ' ', x)
    x = re.sub(r'http*\S+', ' ', x)
    x = re.sub(r'@\S+', ' ', x)
    x = re.sub(r'#\S+', ' ', x)
    x = re.sub(r'\'\w+', '', x)
    x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
    x = re.sub(r'\w*\d+\w*', '', x)
    x = re.sub(r'\s{2,}', ' ', x)
    return x


df_test['text'] = df_test['text'].apply(text_preproc)

In [13]:
predictions = grid_search.best_estimator_.predict_proba(df_test['text']) #usamos el predict_proba porque queremos saber la probabilidad de que sea un desastre 1 o no sea un
#desastre 0. Nos genera un array de 2 columnas, nos tenemos que quedar con la segunda columna que es la que indica la probabilidad

In [14]:
predictions

array([[0.05096029, 0.94903971],
       [0.86117074, 0.13882926],
       [0.29861072, 0.70138928],
       ...,
       [0.14413476, 0.85586524],
       [0.30109661, 0.69890339],
       [0.14416623, 0.85583377]])

In [15]:
df_test_pr = pd.DataFrame(predictions)

In [16]:
df_test_pr

Unnamed: 0,0,1
0,0.050960,0.949040
1,0.861171,0.138829
2,0.298611,0.701389
3,0.141774,0.858226
4,0.308186,0.691814
...,...,...
2262,0.144145,0.855855
2263,0.245614,0.754386
2264,0.144135,0.855865
2265,0.301097,0.698903


In [17]:
df_test_pr = df_test_pr.iloc[:,1:2]
df_test_pr

Unnamed: 0,1
0,0.949040
1,0.138829
2,0.701389
3,0.858226
4,0.691814
...,...
2262,0.855855
2263,0.754386
2264,0.855865
2265,0.698903


## Guardar submission:

In [18]:
df_test_or = pd.read_csv('data/test.csv')

In [19]:
df_test_04 = pd.DataFrame(df_test_or['id'])

In [20]:
df_test_01 = pd.DataFrame()

In [21]:
df_test_01 = pd.concat([df_test_04, df_test_pr[1]], axis=1)

In [22]:
df_test_01.set_index(['id'], inplace=True)

In [23]:
df_test_01.rename(columns={1:'target'}, inplace=True)

In [24]:
df_test_01.to_csv('test_09.csv')

In [25]:
df_test_01

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
6,0.949040
7,0.138829
10,0.701389
14,0.858226
15,0.691814
...,...
10852,0.855855
10855,0.754386
10859,0.855865
10867,0.698903
