In [1]:
import pandas as pd
import re
import joblib
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings

In [2]:
# importing file
file = pd.read_csv('Accuracy_tweets_data.csv')
df = pd.DataFrame(file)
df = df.head(10)
warnings.filterwarnings('ignore')
# df['text'] = df['text'].astype(str)  

In [3]:
# taking only 2 columns which is necessary
df1 = df[['text','airline_sentiment']]

In [4]:
# cleaning text
def filter_txt(twts):
    twts = re.sub(r'@[A-Za-z0-9]+', '', twts)  # removing @mentions
    twts = re.sub(r'RT[\s]+','',twts) # removing RT
    twts = re.sub(r'#','',twts) # removing #
    twts = re.sub(r'https?://(t|www).([a-zA-Z0-9/]+)','',twts) # removing https links
    twts = re.sub(r'[\n]+','',twts) # removing \n
    twts = re.sub(r'_','',twts)
    twts = re.sub(r'^(,|.|-)','',twts)
    twts = re.sub(r'\s+',' ',twts.strip()) # trim if more then one whitespace
    twts = twts.lstrip()
    return twts

In [5]:
df1['text'] = df1['text'].apply(filter_txt)
pd.set_option('display.max_colwidth', None)
df1

Unnamed: 0,text,airline_sentiment
0,What said.,neutral
1,plus you've added commercials to the experience... tacky.,positive
2,I didn't today... Must mean I need to take another trip!,neutral
3,"it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",negative
4,and it's a really big bad thing about it,negative
5,seriously would pay $30 a flight for seats that didn't have this playing.it's really the only bad thing about flying VA,negative
6,"yes, nearly every time I fly VX this “ear worm” won’t go away :)",positive
7,"Really missed a prime opportunity for Men Without Hats parody, there.",neutral
8,"Well, I didn't…but NOW I DO! :-D",positive
9,"it was amazing, and arrived an hour early. You're too good to me.",positive


In [6]:
# df1.to_csv('howare.csv')

In [7]:
analyzer = SentimentIntensityAnalyzer()
Tweets_polarity = []
for twts in df1['text']:
    ps = analyzer.polarity_scores(twts)
    Tweets_polarity.append({'compound':ps['compound']})
    
df2 = pd.DataFrame(Tweets_polarity)

In [8]:
# Function to classify the scores as pos,neg and neutral
def get_analysis(scores):
    if scores<0:
        return 'Negative'
    elif scores==0:
        return 'Neutral'
    else:
        return 'Positive'

# Adding column
df1['Predicted'] = df2['compound'].apply(get_analysis)
df1.rename(columns={'text':'Tweets',
                    'airline_sentiment':'Actual'},inplace=True)
df1['Actual'] = df1['Actual'].str.capitalize()
df1

Unnamed: 0,Tweets,Actual,Predicted
0,What said.,Neutral,Neutral
1,plus you've added commercials to the experience... tacky.,Positive,Neutral
2,I didn't today... Must mean I need to take another trip!,Neutral,Neutral
3,"it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",Negative,Negative
4,and it's a really big bad thing about it,Negative,Negative
5,seriously would pay $30 a flight for seats that didn't have this playing.it's really the only bad thing about flying VA,Negative,Negative
6,"yes, nearly every time I fly VX this “ear worm” won’t go away :)",Positive,Positive
7,"Really missed a prime opportunity for Men Without Hats parody, there.",Neutral,Positive
8,"Well, I didn't…but NOW I DO! :-D",Positive,Positive
9,"it was amazing, and arrived an hour early. You're too good to me.",Positive,Positive


In [9]:
def accuracy_func(actual_value,predicted_value):
    correct_predictions = 0
    for true, predicted in zip(actual_value,predicted_value):
        if true == predicted:
            correct_predictions += 1
    accuracy = correct_predictions/len(predicted_value)
    return accuracy


def precision_func(actual_value,predicted_value):
    prediction_actual_positive = 0
    total_prediction_positive = 0
    for true, predicted in zip(actual_value,predicted_value):
        if true=='Positive' and predicted =='Positive':
            prediction_actual_positive += 1     
        if predicted == 'Positive':
            total_prediction_positive+=1
    precision = prediction_actual_positive/total_prediction_positive
    return precision


def recall_func(actual_value,predicted_value):
    prediction_actual_positive = 0
    total_actual_positive = 0
    for true, predicted in zip(actual_value,predicted_value):
        if true=='Positive' and predicted =='Positive':
            prediction_actual_positive += 1     
        if true=='Positive':
            total_actual_positive+=1
    recall = prediction_actual_positive/total_actual_positive
    return recall

In [10]:
# accuracy
accuracy_func(df1['Actual'],df1['Predicted'])

0.8

In [11]:
# precision
precision_func(df1['Actual'],df1['Predicted'])

0.75

In [12]:
# recall
recall_func(df1['Actual'],df1['Predicted'])

0.75

In [13]:
model = joblib.load('ml.joblib')

In [16]:
cv = TfidfVectorizer()
X = cv.fit_transform(df1['Tweets'])

In [20]:
model.predict(X)

ValueError: X has 85 features, but DecisionTreeClassifier is expecting 12177 features as input.