## API de Ressarcimento Sinistro Automóvel (NLP de ressarcimento do sinistro)

#### Probabilidade de um sinistro ser ressarcível

In [31]:
# import important modules
import numpy as np
import pandas as pd

# sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB # classifier 

from sklearn.metrics import (
    accuracy_score,
    classification_report
)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# text preprocessing modules
from string import punctuation 

# text preprocessing modules
from nltk.tokenize import word_tokenize

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import re #regular expression

# Download dependency
for dependency in (
    "brown",
    "names",
    "wordnet",
    "averaged_perceptron_tagger",
    "universal_tagset",
):
    nltk.download(dependency)
    
import warnings
warnings.filterwarnings("ignore")
# seeding
np.random.seed(123)

import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package brown to /home/ubuntu/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package names to /home/ubuntu/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [32]:
# load data
colnames=['DESCRICAO', 'SITUAÇÃO DO SINISTRO']
colrenames={'DESCRICAO': 'review','SITUAÇÃO DO SINISTRO': 'situacao'}
data = pd.read_csv("../data/base_de_descricoes.csv", usecols=colnames)
data.rename(columns=colrenames, inplace=True)
data['situacao'] = data['situacao'].replace('SEGURADO CULPADO', 'BAIXADO COM RESSARCIMENTO')
data['sentiment'] = data['situacao'].replace('BAIXADO COM RESSARCIMENTO', 1).replace('BAIXADO SEM RESSARCIMENTO', 0)

In [None]:
# show top five rows of data
data.dropna(inplace=True)
data.head() 


In [34]:
# check the shape of the data
data.shape

(87885, 3)

In [35]:
# check missing values in data
data.isnull().sum()

review       0
situacao     0
sentiment    0
dtype: int64

In [36]:
# evalute news sentiment distribution
data.sentiment.value_counts()

sentiment
0    78181
1     9704
Name: count, dtype: int64

### Preparação de dados

In [37]:
stop_words =  stopwords.words('english')

def text_cleaning(text, remove_stop_words=True, lemmatize_words=True):
    # Clean the text, with the option to remove stop_words and to lemmatize word

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"\'s", " ", text)
    text =  re.sub(r'http\S+',' link ', text)
    text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text) # remove numbers
        
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
    
    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if lemmatize_words:
        text = text.split()
        lemmatizer = WordNetLemmatizer() 
        lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
        text = " ".join(lemmatized_words)
    
    # Return a list of words
    return(text)

In [None]:
#clean the review
data["cleaned_review"] = data["review"].apply(text_cleaning)
data

In [None]:
#split features and target from train data 
#features = pd.DataFrame(['']*len(data), index=data.cleaned_review, columns=[''])
features = data["cleaned_review"]
target = data.sentiment.values
features

In [41]:
# split data into train and validate

X_train, X_test, y_train, y_test= train_test_split(
    features,
    target,
    test_size=0.30,
    random_state=42,
    shuffle=True,
    stratify=target,
)

In [42]:
print(f"X_train dataset features size: {X_train.shape}")
print(f"X_test dataset label size: {X_test.shape}")
print(f"y_train dataset features size: {y_train.shape}")
print(f"y_test dataset features size: {y_test.shape}")

X_train dataset features size: (61519,)
X_test dataset label size: (26366,)
y_train dataset features size: (61519,)
y_test dataset features size: (26366,)


### Create Classifier 

In [43]:
# The multinomial RandomForestClassifier is suitable for classification with discrete features
# (e.g., word counts for text classification).

# Create a classifier in pipeline
sentiment_classifier = Pipeline(steps=[
                                 ('pre_processing',TfidfVectorizer(lowercase=False)),
                                 ('clf',RandomForestClassifier())
                                 ])

In [44]:
# train the sentiment classifier 
sentiment_classifier.fit(X_train,y_train)

In [45]:
# test model performance on valid data 
y_preds = sentiment_classifier.predict(X_test)

In [46]:
# evalute model performance by using log_loss in the validation data
accuracy_score(y_test,y_preds)

0.8896685124781916

In [47]:
# plot the comfusion matrix
#plot_confusion_matrix(sentiment_classifier, X_valid, y_valid, normalize='true')

In [48]:
# check the classification report
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94     23455
           1       1.00      0.00      0.00      2911

    accuracy                           0.89     26366
   macro avg       0.94      0.50      0.47     26366
weighted avg       0.90      0.89      0.84     26366



In [49]:
#save model 
import joblib 

joblib.dump(sentiment_classifier, '../models/sentiment_model_pipeline.pkl')

['../models/sentiment_model_pipeline.pkl']

In [50]:
import joblib
pipe = joblib.load('../models/sentiment_model_pipeline.pkl'
)

pipe

In [None]:

posicao = 66 # 0 NA0 5 SIM 88 (SIM API com erro)

descricaoSinistro = data["cleaned_review"][posicao]
# clean the review
cleaned_review = text_cleaning(descricaoSinistro)
cleaned_review = [cleaned_review]
print(cleaned_review)
print(data['situacao'][posicao])
 

In [52]:
previsao = pipe.predict(cleaned_review)
output = int(previsao[0])
probas = pipe.predict_proba(cleaned_review)
output_probability = "{:.2f}".format(float(probas[:, output]))

# output dictionary
sentiments = {0: "NAO", 1: "SIM"}

# show results
result = {"ressarcivel": sentiments[output], "probabilidade": output_probability}
result

{'ressarcivel': 'SIM', 'probabilidade': '0.66'}