This notebook reaches about 0.8 accuracy without much cleaning of the data.

## Data loading and cleaning

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score

We are going to distinguish two cases: tweets with negative sentiment and tweets with non-negative sentiment

In [None]:
# Load the train and test CSVs from the Kaggle input directory.
train_data = pd.read_csv('../input/nlp-getting-started/train.csv')
test_data = pd.read_csv('../input/nlp-getting-started/test.csv')

# A notebook cell renders only its last expression, so the preview has to be
# displayed explicitly; as a bare statement it was computed and then discarded.
# `display` is the built-in provided by the IPython kernel.
display(train_data.head(10))
train_data.dtypes

In [None]:
import re
def clean_text(df, text_field, new_text_field_name):
    """Add a lower-cased, de-noised version of a text column to ``df``.

    The cleaned text is produced in two passes over the lower-cased input:

    1. Remove ``@handle`` mentions made of ASCII letters/digits, every
       character that is not an ASCII letter, digit, space or tab,
       ``scheme://...`` URLs, a leading ``rt``, and ``http`` together with the
       character that follows it.
    2. Remove every run of digits.

    Missing values in ``text_field`` are treated as empty strings, so they end
    up as ``""`` in the new column instead of raising ``TypeError`` inside
    ``re.sub``.

    Args:
        df: DataFrame holding the raw text. It is modified in place.
        text_field: Name of the column to read the raw text from.
        new_text_field_name: Name of the column to write the cleaned text to
            (created, or overwritten if it already exists).

    Returns:
        The same ``df`` object that was passed in, with the new column set.
    """
    # Compile once per call instead of once per row.
    noise = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")
    digits = re.compile(r"\d+")

    def _scrub(text):
        # Order matters: digits are only stripped after URLs and handles have
        # been removed as whole tokens.
        return digits.sub("", noise.sub("", text))

    # fillna("") keeps every value a string so the regex pass cannot hit NaN.
    df[new_text_field_name] = df[text_field].fillna("").str.lower().map(_scrub)
    return df
# clean_text writes the new column in place and returns the frame it was given,
# so each data_clean_* name refers to the same DataFrame object that was passed in.
data_clean_train = clean_text(df=train_data, text_field='text', new_text_field_name='text_clean')
data_clean_test = clean_text(df=test_data, text_field='text', new_text_field_name='text_clean')



In [None]:
# `nltk` and `stopwords` are already imported in the notebook's first cell, so
# this cell only needs to make sure the corpus data is available locally.
nltk.download('stopwords')
stop = stopwords.words('english')

# Membership is checked once per word of every tweet; a frozenset makes each
# check a hash lookup instead of a scan over the whole stop-word list.
stop_lookup = frozenset(stop)


def remove_stopwords(text):
    """Return ``text`` without English stop words, tokens joined by single spaces."""
    return ' '.join(word for word in text.split() if word not in stop_lookup)


data_clean_train['text_clean'] = data_clean_train['text_clean'].apply(remove_stopwords)
data_clean_test['text_clean'] = data_clean_test['text_clean'].apply(remove_stopwords)

## Machine Learning Model

We split the data into training and testing set:

In [None]:
# Hold out 20% of the labelled rows; the fixed seed keeps the split repeatable.
train, test = train_test_split(data_clean_train, random_state=1, test_size=0.2)

# Features are the cleaned tweet texts, labels are the `target` column.
X_train, y_train = train['text_clean'].values, train['target']
X_test, y_test = test['text_clean'].values, test['target']

In [None]:
# Keep the hold-out texts under their own name. Rebinding `test` to this Series
# made the cell fail on a second run, because `test['text_clean']` then became
# a label lookup on a Series instead of a column lookup on the DataFrame.
test_text = test['text_clean']
def tokenize(text):
    """Split ``text`` into tokens with a default-configured NLTK ``TweetTokenizer``."""
    return TweetTokenizer().tokenize(text)

# `stem` reads the module-level names `stemmer` and `analyzer`, which the
# notebook never defined, so any call raised NameError. They are defined here.
# NOTE(review): `tokenize` is used as the analyzer so stemming sees the same
# tokens as the vectorizer — confirm this matches the original intent.
stemmer = SnowballStemmer("english")
analyzer = tokenize


def stem(doc):
    """Lazily yield the Snowball stem of every token that ``analyzer`` finds in ``doc``."""
    return (stemmer.stem(w) for w in analyzer(doc))

en_stopwords = set(stopwords.words("english"))

# Bag-of-words features: unigram counts over tokens produced by `tokenize`.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    lowercase=True,
    ngram_range=(1, 1),
    # scikit-learn documents `stop_words` as 'english', a list, or None, and
    # recent releases reject a set when the vectorizer is fitted. A sorted list
    # carries the same words in a deterministic order.
    stop_words=sorted(en_stopwords))

We are going to use cross-validation and grid search to find good hyperparameters for our SVM model. We need to build a pipeline so that features are not learned from the validation folds when each training model is built.

In [None]:
# Five shuffled stratified folds; the fixed seed keeps fold membership repeatable.
kfolds = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=1,
)

In [None]:
np.random.seed(1)

# Vectorizer and classifier are chained so that, within each CV split, the
# vocabulary is fitted on the training folds only. The step names are the ones
# make_pipeline would generate, which is what the 'svc__C' grid key refers to.
pipeline_svm = Pipeline(steps=[
    ('countvectorizer', vectorizer),
    ('svc', SVC(probability=True, kernel="linear", class_weight="balanced")),
])

# Search over the regularisation strength C, scored by ROC AUC on `kfolds`.
grid_svm = GridSearchCV(
    pipeline_svm,
    param_grid={'svc__C': [0.01, 0.1, 1]},
    cv=kfolds,
    scoring="roc_auc",
    verbose=1,
    n_jobs=-1,
)

grid_svm.fit(X_train, y_train)
grid_svm.score(X_test, y_test)

In [None]:
# Parameter setting (the value of 'svc__C') that the grid search selected.
grid_svm.best_params_

In [None]:
# Cross-validated score of the selected setting; the search was scored with "roc_auc".
grid_svm.best_score_

In [None]:
def report_results(model, X, y):
    """Summarise a fitted binary classifier's performance on ``(X, y)``.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X: Inputs to score.
        y: True labels for ``X``.

    Returns:
        dict with keys ``'auc'``, ``'f1'``, ``'acc'``, ``'precision'`` and
        ``'recall'``. AUC is computed from column 1 of ``predict_proba``; the
        other metrics are computed from the hard predictions.
    """
    positive_scores = model.predict_proba(X)[:, 1]
    predicted_labels = model.predict(X)

    return {
        'auc': roc_auc_score(y, positive_scores),
        'f1': f1_score(y, predicted_labels),
        'acc': accuracy_score(y, predicted_labels),
        'precision': precision_score(y, predicted_labels),
        'recall': recall_score(y, predicted_labels),
    }

Let's see how the model (with the best hyperparameters) works on the test data:

In [None]:
# Score the refit best pipeline on the 20% hold-out split.
report_results(grid_svm.best_estimator_, X_test, y_test)

In [None]:
# Alternative summary of the same hold-out predictions: scikit-learn's
# text classification report, printed as plain text.
from sklearn.metrics import classification_report
pred = grid_svm.predict(X_test)
print(classification_report(y_test, pred))

In [None]:
def get_roc_curve(model, X, y):
    """Return ``(fpr, tpr)`` for ``model`` evaluated on ``(X, y)``.

    The curve is computed from column 1 of ``model.predict_proba(X)``; the
    thresholds returned by ``roc_curve`` are discarded.
    """
    positive_scores = model.predict_proba(X)[:, 1]
    return roc_curve(y, positive_scores)[:2]

In [None]:
# (fpr, tpr) pair for the refit best pipeline on the hold-out split.
roc_svm = get_roc_curve(grid_svm.best_estimator_, X_test, y_test)

In [None]:
fpr, tpr = roc_svm

fig, ax = plt.subplots(figsize=(14, 8))
ax.plot(fpr, tpr, color="red")
# Dashed diagonal for visual reference.
ax.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Roc curve')
plt.show()

Let's see whether our model has a bias or variance problem by plotting its learning curve:

In [None]:
from sklearn.model_selection import learning_curve

# Refit the best pipeline on growing fractions (10% .. 100%) of the training
# split and record ROC AUC on both the training and the validation folds.
# NOTE(review): `random_state` is presumably only honoured when `shuffle=True`
# is also passed — confirm against the scikit-learn documentation.
train_sizes, train_scores, test_scores = learning_curve(
    grid_svm.best_estimator_,
    X_train,
    y_train,
    train_sizes=np.linspace(.1, 1.0, 10),
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    random_state=1,
)

In [None]:
def plot_learning_curve(X, y, train_sizes, train_scores, test_scores, title='', ylim=None, figsize=(14,8)):
    """Draw training and cross-validation learning curves on a new figure.

    Each curve is the mean score across axis 1 of its score array, plotted
    against ``train_sizes``, with a shaded band of one standard deviation
    either side.

    Args:
        X, y: Accepted for call compatibility; not used by the plot.
        train_sizes: x-axis values, one per row of the score arrays.
        train_scores: 2-D array of training scores.
        test_scores: 2-D array of validation scores.
        title: Figure title.
        ylim: Optional ``(low, high)`` pair for the y-axis limits.
        figsize: Figure size passed to ``plt.figure``.

    Returns:
        The ``matplotlib.pyplot`` module, so the caller can call ``.show()``.
    """
    plt.figure(figsize=figsize)
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.grid()

    # (mean, std, colour, legend label) for each curve, in drawing order.
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in (
            (train_scores, "r", "Training score"),
            (test_scores, "g", "Cross-validation score"),
        )
    ]

    # Shaded bands first, then the lines on top of them.
    for mean, std, colour, _ in curves:
        plt.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(train_sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="lower right")
    return plt

In [None]:
# Plot the curves computed by learning_curve. X_train and y_train are required
# positionally, but the function only draws the precomputed score arrays.
plot_learning_curve(X_train, y_train, train_sizes, 
                    train_scores, test_scores, ylim=(0.7, 1.01), figsize=(14,6))
plt.show()

It looks like there isn't a big bias or variance problem, but it is clear that our model would work better with more data: if we can get more labeled data, the model performance will increase.

## Examples

We are going to apply the obtained machine learning model to some example text. If the output is **1** it means that the text has a negative sentiment associated:

In [None]:
# The string is passed to the fitted pipeline as-is, whereas the training texts
# went through clean_text and stop-word removal first.
# NOTE(review): confirm whether example inputs should be cleaned the same way.
grid_svm.predict(["flying with @united is always a great experience"])

In [None]:
# Same opening sentence as the first example, followed by a second sentence.
grid_svm.predict(["flying with @united is always a great experience. If you don't lose your luggage"])

In [None]:
# Single raw string wrapped in a list, because predict expects an iterable of documents.
grid_svm.predict(["I love @united. Sorry, just kidding!"])

In [None]:
# Short raw example containing a handle and punctuation.
grid_svm.predict(["@united very bad experience!"])

In [None]:
# NOTE(review): identical input to the cell directly before this one — looks
# like an accidental duplicate; confirm whether a different example was intended.
grid_svm.predict(["@united very bad experience!"])

In [None]:
# Clean a copy of the competition test set the same way as the training data
# (clean_text, then stop-word removal) and keep only the cleaned text Series.
submission_test_clean = (
    clean_text(test_data.copy(), "text", "text_clean")["text_clean"]
    .apply(lambda tweet: ' '.join(token for token in tweet.split() if token not in stop))
)
submission_test_clean.head()

In [None]:
# Hard class predictions for every cleaned competition test tweet.
submission_test_pred = grid_svm.predict(submission_test_clean)

In [None]:
# Pair each test-set id with its predicted target.
id_col = test_data['id']
submission_df_1 = pd.DataFrame(dict(id=id_col, target=submission_test_pred))
submission_df_1.head()

In [None]:
# Write the id/target pairs without the DataFrame index.
# NOTE(review): the file name is spelled with three s's ("submisssions") —
# confirm this is the name the submission step expects.
submission_df_1.to_csv("submisssions.csv", index=False)