In [45]:
import pandas as pd
import numpy as np
import re
# from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [46]:
train = pd.read_csv("test_twitter_x_test.csv") #10980 rows 12 cols
x_test = pd.read_csv("training_twitter_x_y_train.csv")

In [47]:
print(train['negativereason_gold'].nunique())
print(train['negativereason_gold'].value_counts(),"\n")

print(x_test.negativereason_gold.nunique())
print(x_test.negativereason_gold.value_counts())

6
Customer Service Issue                      3
Flight Attendant Complaints                 1
Late Flight                                 1
Late Flight\nLost Luggage                   1
Cancelled Flight\nCustomer Service Issue    1
Can't Tell                                  1
Name: negativereason_gold, dtype: int64 

11
Customer Service Issue                      9
Late Flight                                 3
Cancelled Flight                            3
Can't Tell                                  2
Late Flight\nFlight Attendant Complaints    1
Cancelled Flight\nCustomer Service Issue    1
Late Flight\nCancelled Flight               1
Customer Service Issue\nLost Luggage        1
Customer Service Issue\nCan't Tell          1
Bad Flight                                  1
Lost Luggage\nDamaged Luggage               1
Name: negativereason_gold, dtype: int64


## Cleaning

In [48]:
drop_cols = ['airline_sentiment_gold','name','tweet_id', 'retweet_count','tweet_created','user_timezone','tweet_coord','tweet_location']
train.drop(drop_cols, axis = 1, inplace=True)
x_test.drop(drop_cols, axis = 1, inplace=True)

In [49]:
stops = stopwords.words('english')
stops += list(punctuation)
stops += ['flight','airline','flights','AA']

In [50]:
abbreviations = {'ppl': 'people','cust':'customer','serv':'service','mins':'minutes','hrs':'hours','svc': 'service',
           'u':'you','pls':'please'}

train_index = train[~train.negativereason_gold.isna()].index
test_index = x_test[~x_test.negativereason_gold.isna()].index

for index, row in train.iterrows():
    tweet = row.text
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',tweet) #remove links
    tweet = re.sub('@[^\s]+','',tweet) #remove usernames
    tweet = re.sub('[\s]+', ' ', tweet) #remove additional whitespaces
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) #replace #word with word
    tweet = tweet.strip('\'"') #trim tweet
    words = []
    for word in tweet.split():
#         if not hasNumbers(word):
        if word.lower() not in stops:
            if word in list(abbreviations.keys()):
                words.append(abbreviations[word])
            else:
                words.append(word.lower())   
    tweet = " ".join(words)
    tweet = " %s %s" % (tweet, row.airline)
    row.text = tweet
    if index in train_index:
        row.text = " %s %s" % (row.text, row.negativereason_gold)

for index, row in x_test.iterrows():
    tweet = row.text
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',tweet) #remove links
    tweet = re.sub('@[^\s]+','',tweet) #remove usernames
    tweet = re.sub('[\s]+', ' ', tweet) #remove additional whitespaces
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) #replace #word with word
    tweet = tweet.strip('\'"') #trim tweet
    words = []
    for word in tweet.split(): 
#         if not hasNumbers(word):
        if word.lower() not in stops:
            if word in list(abbreviations.keys()):
                words.append(abbreviations[word])
            else:
                words.append(word.lower())
    tweet = " ".join(words)
    tweet = " %s %s" % (tweet, row.airline)
    row.text = tweet
    if index in test_index:
        row.text = " %s %s" % (row.text, row.negativereason_gold)

del train['negativereason_gold']
del x_test['negativereason_gold']

In [51]:
def deEmojify(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')

for index, row in train.iterrows():
    row.text = deEmojify(row.text)

for index, row in x_test.iterrows():
    row.text = deEmojify(row.text)

In [52]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

for index, row in train.iterrows():
    words = row.text.split()
    new_words = []
    for word in words:
        if not hasNumbers(word):
            new_words.append(word)
    row.text = " ".join(new_words)
    
for index, row in x_test.iterrows():
    words = row.text.split()
    new_words = []
    for word in words:
        if not hasNumbers(word):
            new_words.append(word)
    row.text = " ".join(new_words)

In [53]:
train.head()
x_test.head()

Unnamed: 0,airline_sentiment,airline,text
0,negative,Southwest,"scheduled morning, days fact, yes..not sure ev..."
1,positive,Southwest,seeing workers time time going beyond love fly...
2,positive,United,"flew ord miami back great crew, service legs. ..."
3,negative,Southwest,that's horse radish Southwest
4,negative,United,"ord delayed air force one, last sbn minutes la..."


## Creating vocab and data formatting

In [54]:
v = TfidfVectorizer(analyzer='word', max_features=3150, max_df = 0.8, ngram_range=(1,1))
train_features= v.fit_transform(train.text)
test_features=v.transform(x_test.text)

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# Assuming 'train' and 'x_test' are already loaded as DataFrames
# Example:
# train = pd.read_csv('train_data.csv')
# x_test = pd.read_csv('test_data.csv')

# Check if 'airline_sentiment' column exists
if 'airline_sentiment' not in train.columns:
    raise KeyError("The 'airline_sentiment' column is missing in the train dataset.")

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer(max_features=1000)
train_features = vectorizer.fit_transform(train['text'])
test_features = vectorizer.transform(x_test['text'])

# Train a logistic regression model
clf = LogisticRegression(C=2.1, solver='liblinear', multi_class='auto')

# Predict sentiment on the test data
pred = clf.predict(test_features)

# Save predictions to a CSV file
with open('predictions_twitter.csv', 'w') as f:
    for item in pred:
        f.write(f"{item}\n")

KeyError: "The 'airline_sentiment' column is missing in the train dataset."

In [66]:
clf = SVC(kernel="linear", C= 0.96 , gamma = 'scale')
clf = SVC(C = 1000, gamma = 0.001)
clf.fit(train_features, train['airline_sentiment'])
pred = clf.predict(test_features)

KeyError: 'airline_sentiment'

In [60]:
with open('predictions_twitter2.csv', 'w') as f: #less accurate
    for item in pred:
        f.write("%s\n" % item)

NameError: name 'pred' is not defined

In [59]:
v.get_feature_names()

AttributeError: 'TfidfVectorizer' object has no attribute 'get_feature_names'