In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer                       
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack

In [2]:
# Load the airline tweets dataset and preview its first five rows.
df_input = pd.read_csv(filepath_or_buffer='input/AirlineTweets.csv')
df_input.head(n=5)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [3]:
# Integer encoding of the sentiment labels; used as the classification target.
sent2int = dict(positive=2, neutral=1, negative=0)

# Look each label up in the mapping (a label missing from sent2int raises KeyError).
df_input['int_class'] = df_input['airline_sentiment'].apply(sent2int.__getitem__)
df_input[['int_class', 'airline_sentiment']].head()

Unnamed: 0,int_class,airline_sentiment
0,1,neutral
1,2,positive
2,1,neutral
3,0,negative
4,0,negative


In [4]:
# Hold out 20% of the tweets for evaluation (text only as the feature).
# A fixed random_state keeps the split, and therefore the reported accuracies,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df_input['text'], df_input['int_class'], test_size=0.2, random_state=42
)

In [5]:
# Baseline: sentiment classification from the tweet text alone (no other columns).

# Fit the vocabulary/IDF weights on the training text only, then reuse them for the test text.
tfidf = TfidfVectorizer()
tf_X_train = tfidf.fit_transform(X_train)
tf_X_test = tfidf.transform(X_test)

# fit() returns the estimator itself, so construction and fitting can be chained.
lr_text_only = LogisticRegression(max_iter=500).fit(tf_X_train, y_train)

train_acc = lr_text_only.score(tf_X_train, y_train)
test_acc = lr_text_only.score(tf_X_test, y_test)
print(f'Train accuracy: {train_acc}')
print(f'Test accuracy: {test_acc}')

Train accuracy: 0.8803790983606558
Test accuracy: 0.8029371584699454


In [6]:
# Idea: augment the TF-IDF text features with other columns from the original dataset,
# e.g. airline_sentiment_confidence (float), negativereason (str), negativereason_confidence (float).

# cols = ['text', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence']

# X_train, X_test, y_train, y_test = train_test_split(df_input[cols], df_input['int_class'], test_size=0.2)

# tfidf = TfidfVectorizer()
# tf_X_train = tfidf.fit_transform(X_train['text'])
# tf_X_test = tfidf.transform(X_test['text'])

In [7]:
# asc_col_train = X_train['airline_sentiment_confidence'].to_numpy().reshape(-1,1)
# asc_col_test = X_test['airline_sentiment_confidence'].to_numpy().reshape(-1,1)
# X_train_asc = hstack((tf_X_train, asc_col_train))
# X_test_asc = hstack((tf_X_test, asc_col_test))

# lr = LogisticRegression(max_iter=500)
# lr.fit(X_train_asc, y_train)

# print(f'Train accuracy: {lr.score(X_train_asc, y_train)}')
# print(f'Test accuracy: {lr.score(X_test_asc, y_test)}')

In [8]:
# add other tf-idf'd text

# Columns carried through the split: two text-like columns and two numeric confidence columns.
# NOTE(review): in the rows previewed by df_input.head(), negativereason is only filled in
# for negative tweets, so it (and negativereason_confidence) may leak the label into the
# features -- confirm before trusting the accuracy of a model trained on them.
cols = ['text', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence']

# A fixed random_state keeps the split, and therefore the reported accuracies,
# reproducible across kernel restarts.
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    df_input[cols], df_input['int_class'], test_size=0.2, random_state=42
)

def add_tfidf(cols, X_train, X_test):
    """Build TF-IDF features for several text columns and stack them side by side.

    A separate TfidfVectorizer is fitted per column on the training frame and then
    applied to the test frame. Missing values are replaced with '' before vectorizing.

    Args:
        cols: sequence of column names to vectorize; must contain at least one name
            (an empty sequence raises IndexError).
        X_train: training frame containing every column in ``cols``.
        X_test: test frame containing every column in ``cols``.

    Returns:
        Tuple ``(train_matrix, test_matrix)``. With a single column these are the
        vectorizer outputs themselves; with several columns they are the result of
        scipy.sparse.hstack over the per-column matrices, in the order given by ``cols``.
    """

    def _vectorize(name):
        # Fit on train only, then transform test with the same fitted vectorizer.
        vectorizer = TfidfVectorizer()
        train_block = vectorizer.fit_transform(X_train[name].fillna(''))
        test_block = vectorizer.transform(X_test[name].fillna(''))
        return train_block, test_block

    first, remaining = cols[0], cols[1:]
    train_matrix, test_matrix = _vectorize(first)

    for name in remaining:
        train_block, test_block = _vectorize(name)
        train_matrix = hstack((train_matrix, train_block))
        test_matrix = hstack((test_matrix, test_block))

    return train_matrix, test_matrix

# TF-IDF features for the tweet text and the negativereason column, stacked column-wise.
X_train_tfidf, X_test_tfidf = add_tfidf(
    cols=['text', 'negativereason'],
    X_train=X_train_full,
    X_test=X_test_full,
)


In [9]:
def add_float_cols(cols, X_train, X_test):
    """Extract numeric columns from the train/test frames as dense 2-D arrays.

    Missing values are replaced with 0. The columns are selected in one vectorized
    step rather than stacked one at a time, which also means an empty ``cols``
    yields arrays with zero columns instead of raising IndexError.

    Args:
        cols: sequence of column names to extract (may be empty).
        X_train: training frame containing every column in ``cols``.
        X_test: test frame containing every column in ``cols``.

    Returns:
        Tuple ``(X_train_float, X_test_float)`` of numpy arrays with shape
        ``(n_rows, len(cols))``, columns in the order given by ``cols``.
    """
    # list(...) so that a tuple of names is treated as a column list, not a single key.
    selected = list(cols)
    X_train_float = X_train[selected].fillna(0).to_numpy()
    X_test_float = X_test[selected].fillna(0).to_numpy()
    return X_train_float, X_test_float

# Dense block holding the two numeric confidence columns (NaN filled with 0).
X_train_float, X_test_float = add_float_cols(
    cols=['airline_sentiment_confidence', 'negativereason_confidence'],
    X_train=X_train_full,
    X_test=X_test_full,
)

In [10]:
# Final model inputs: TF-IDF columns first, followed by the numeric columns.
X_train_inp = hstack(blocks=[X_train_tfidf, X_train_float])
X_test_inp = hstack(blocks=[X_test_tfidf, X_test_float])

In [13]:
def setup_lr(X_train, y_train, max_iter=500):
    """Create a LogisticRegression classifier and fit it on the training data.

    Args:
        X_train: training feature matrix passed to ``LogisticRegression.fit``.
        y_train: training labels passed to ``LogisticRegression.fit``.
        max_iter: iteration cap forwarded to ``LogisticRegression``; defaults to
            500, the value this notebook previously hard-coded.

    Returns:
        The fitted LogisticRegression instance.
    """
    lr = LogisticRegression(max_iter=max_iter)
    lr.fit(X_train, y_train)
    return lr

def score_lr(lr, X_train, X_test, y_train, y_test):
    """Format the train and test scores of a fitted model as a two-line report.

    Args:
        lr: fitted estimator exposing ``score(X, y)``.
        X_train: training features.
        X_test: test features.
        y_train: training labels.
        y_test: test labels.

    Returns:
        A string of the form ``'Train accuracy: <score>\\nTest accuracy: <score>'``.
    """
    train_acc = lr.score(X_train, y_train)
    test_acc = lr.score(X_test, y_test)
    # The second line reports the held-out score, so it is labelled "Test"
    # (it was previously mislabelled "Train").
    return f'Train accuracy: {train_acc}\nTest accuracy: {test_acc}'

# Fit on the combined TF-IDF + numeric features, then report accuracy on both splits.
lr_full = setup_lr(X_train=X_train_inp, y_train=y_train_full)
print(
    score_lr(
        lr=lr_full,
        X_train=X_train_inp,
        X_test=X_test_inp,
        y_train=y_train_full,
        y_test=y_test_full,
    )
)

Train accuracy: 0.9696892076502732
Train accuracy: 0.9381830601092896


In [14]:
# Same report for the text-only model fitted earlier, for comparison.
print(
    score_lr(
        lr=lr_text_only,
        X_train=tf_X_train,
        X_test=tf_X_test,
        y_train=y_train,
        y_test=y_test,
    )
)

Train accuracy: 0.8803790983606558
Train accuracy: 0.8029371584699454
