In [1]:
# General:
import tweepy           # To consume Twitter's API
import pandas as pd     # To handle data
import numpy as np      # For number computing

# For plotting and visualization:
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df=pd.read_csv('twitter.csv', engine='python')


In [3]:
print(df.head())

   sentiment                                              tweet
0          5  Two places I'd invest all my money if I could:...
1          5  Awesome! Google driverless cars will help the ...
2          5  Autonomous vehicles could reduce traffic fatal...
3          5  Really good presentation from Jan Becker on Bo...
4          5  Ford just revealed it's Automated Ford Fusion ...


In [4]:
df.shape

(2664, 2)

In [5]:
type(df)

pandas.core.frame.DataFrame

In [6]:
df['neg_senti']= np.where(df['sentiment']<3, 1, 0)

In [7]:
df.head()

Unnamed: 0,sentiment,tweet,neg_senti
0,5,Two places I'd invest all my money if I could:...,0
1,5,Awesome! Google driverless cars will help the ...,0
2,5,Autonomous vehicles could reduce traffic fatal...,0
3,5,Really good presentation from Jan Becker on Bo...,0
4,5,Ford just revealed it's Automated Ford Fusion ...,0


In [8]:
df[df.neg_senti==1].count()

sentiment    775
tweet        775
neg_senti    775
dtype: int64

In [9]:
df1=df.tweet
df1.head()

0    Two places I'd invest all my money if I could:...
1    Awesome! Google driverless cars will help the ...
2    Autonomous vehicles could reduce traffic fatal...
3    Really good presentation from Jan Becker on Bo...
4    Ford just revealed it's Automated Ford Fusion ...
Name: tweet, dtype: object

In [10]:
from textblob import TextBlob
import re

def clean_tweet(tweet):
    '''
    Utility function to clean the text in a tweet by removing 
    links and special characters using regex.
    '''
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

def analize_sentiment(tweet):
    '''
    Utility function to classify the polarity of a tweet
    using textblob.
    '''
    analysis = TextBlob(clean_tweet(tweet))
    if analysis.sentiment.polarity < 0:
        return 1
    else:
        return 0


In [11]:
# We create a column with the result of the analysis:
df['SA'] = np.array([ analize_sentiment(tweet) for tweet in df['tweet'] ])

# We display the updated dataframe with the new column:
display(df.head(10))

Unnamed: 0,sentiment,tweet,neg_senti,SA
0,5,Two places I'd invest all my money if I could:...,0,0
1,5,Awesome! Google driverless cars will help the ...,0,0
2,5,Autonomous vehicles could reduce traffic fatal...,0,0
3,5,Really good presentation from Jan Becker on Bo...,0,0
4,5,Ford just revealed it's Automated Ford Fusion ...,0,0
5,5,"So yeah, just throwing this out there again. W...",0,0
6,5,@TeslaMotors Musk reluctant to partner with Ap...,0,0
7,5,Finished SF&gt;LA drive. Now in rush hour for ...,0,0
8,5,The #Google autonomous car paid a visit to Nvi...,0,0
9,5,"Finally, a very realistic timeline for full au...",0,0


In [12]:
df['pre_clean_len'] = [len(t) for t in df.tweet]
df.head()

Unnamed: 0,sentiment,tweet,neg_senti,SA,pre_clean_len
0,5,Two places I'd invest all my money if I could:...,0,0,83
1,5,Awesome! Google driverless cars will help the ...,0,0,95
2,5,Autonomous vehicles could reduce traffic fatal...,0,0,68
3,5,Really good presentation from Jan Becker on Bo...,0,0,102
4,5,Ford just revealed it's Automated Ford Fusion ...,0,0,124


In [13]:
df.sort_values(by='pre_clean_len', ascending=False).head()

Unnamed: 0,sentiment,tweet,neg_senti,SA,pre_clean_len
543,4,"""The idea is to challenge traditional automoti...",0,0,145
1704,4,@Lisalynn2000 @cnnmoney Ha. It's coming. Self-...,0,0,144
2160,2,G-creepy self-driving car has Google street ma...,1,1,144
752,4,@CBCNews this am 2 car accidents &amp; 1 near ...,0,0,144
654,4,"@cityslikr: Yes ""still cars"", BUT combine a Dr...",0,1,144


In [14]:
df.groupby('pre_clean_len').size().sort_values(ascending=False).head()

pre_clean_len
140    104
139     85
138     73
136     63
133     56
dtype: int64

In [15]:
from nltk.tokenize import WordPunctTokenizer
from bs4 import BeautifulSoup
tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))
#tweet cleaner 2
def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    words = tok.tokenize(lower_case)
    refined_words=[]
    for word in words:
        if len(word)>2:
            refined_words.append(word)
    return (" ".join(refined_words)).strip()
testing = df.tweet[:100]
test_result = []
for t in testing:
    test_result.append(tweet_cleaner(t))
test_result

['two places invest all money could printing and self driving cars',
 'awesome google driverless cars will help the blind travel more often',
 'autonomous vehicles could reduce traffic fatalities',
 'really good presentation from jan becker bosch automated vehicle research autoauto check out',
 'ford just revealed automated ford fusion hybrid vehicle pretty amazing fordtrends ford test',
 'yeah just throwing this out there again would totally down beta test autonomous car',
 'musk reluctant partner with apple google but android controlled autonomous smart car would awesome',
 'finished drive now rush hour for meeting cant wait for autonomous google car',
 'the google autonomous car paid visit nvidia pretty cool technology',
 'finally very realistic timeline for full autonomous car capability hats off autoforum',
 'cool when cars are fully autonomous cause totally gonna sleep while car drives',
 'autonomous car driving could free the billion that wasted car insurance',
 'awesome google 

In [16]:
#2nd analyzer
def analize_sentiment2(tweet):
    '''
    Utility function to classify the polarity of a tweet
    using textblob.
    '''
    analysis = TextBlob(tweet_cleaner(tweet))
    if analysis.sentiment.polarity < 0:
        return 1
    else:
        return 0

    # We create a column with the result of the analysis:
df['SA2'] = np.array([ analize_sentiment2(tweet) for tweet in df['tweet'] ])

# We display the updated dataframe with the new column:
display(df.head(10))

Unnamed: 0,sentiment,tweet,neg_senti,SA,pre_clean_len,SA2
0,5,Two places I'd invest all my money if I could:...,0,0,83,0
1,5,Awesome! Google driverless cars will help the ...,0,0,95,0
2,5,Autonomous vehicles could reduce traffic fatal...,0,0,68,0
3,5,Really good presentation from Jan Becker on Bo...,0,0,102,0
4,5,Ford just revealed it's Automated Ford Fusion ...,0,0,124,0
5,5,"So yeah, just throwing this out there again. W...",0,0,98,0
6,5,@TeslaMotors Musk reluctant to partner with Ap...,0,0,125,0
7,5,Finished SF&gt;LA drive. Now in rush hour for ...,0,0,98,0
8,5,The #Google autonomous car paid a visit to Nvi...,0,0,76,0
9,5,"Finally, a very realistic timeline for full au...",0,0,129,0


In [25]:
df['tweet'].isnull().sum()

0

In [17]:
df.groupby('neg_senti').size()

neg_senti
0    1889
1     775
dtype: int64

In [18]:
df['cleaned_tweet']= np.array([ tweet_cleaner(tweet) for tweet in df['tweet'] ])
titles = df['cleaned_tweet'].fillna('')

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features = 300, 
                             ngram_range=(1, 1), 
                             stop_words='english',
                             binary=True)

# Use `fit` to learn the vocabulary of the titles
vectorizer.fit(titles)

# Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X = vectorizer.transform(titles)

In [None]:
#Dimensionality Reduction on bag of words


In [19]:
# model random forest
from sklearn.ensemble import RandomForestClassifier
modelrf = RandomForestClassifier(n_estimators = 20, random_state=0, n_jobs=-1, class_weight='balanced')

# model logistic regression
from sklearn.linear_model import LogisticRegression
modellr = LogisticRegression(class_weight= 'balanced') 

# Use `fit` to learn the vocabulary of the titles
vectorizer.fit(titles)

# Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X = vectorizer.transform(titles).toarray()
y = df['neg_senti']

from sklearn.cross_validation import cross_val_score
model= modellr
scoresR = cross_val_score(model, X, y, cv=30, scoring='recall')
print('CV Recall {}, Average Recall {}'.format(scoresR, scoresR.mean()))
# F1 = (2 x recall x precision) / (recall + precision)
scoresf1 = cross_val_score(model, X, y, cv=30, scoring='f1')
print('CV F1 {}, Average F1 {}'.format(scoresf1, scoresf1.mean()))
scoresAUC = cross_val_score(model, X, y, cv=30, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scoresAUC, scoresAUC.mean()))



CV Recall [0.73076923 0.76923077 0.73076923 0.65384615 0.65384615 0.76923077
 0.80769231 0.61538462 0.26923077 0.46153846 0.5        0.57692308
 0.53846154 0.69230769 0.73076923 0.65384615 0.57692308 0.80769231
 0.76923077 0.65384615 0.65384615 0.69230769 0.80769231 0.61538462
 0.65384615 0.72       0.88       0.72       0.68       0.76      ], Average Recall 0.6714871794871794
CV F1 [0.67857143 0.63492063 0.7037037  0.68       0.59649123 0.74074074
 0.7        0.47058824 0.26415094 0.375      0.35616438 0.42253521
 0.3943662  0.52941176 0.61290323 0.51515152 0.47619048 0.71186441
 0.55555556 0.53125    0.52307692 0.61016949 0.6        0.52459016
 0.55737705 0.58064516 0.61971831 0.57142857 0.53125    0.52777778], Average F1 0.5531864366323949
CV AUC [0.85225885 0.80616606 0.87851038 0.88766789 0.83150183 0.90781441
 0.87728938 0.6978022  0.54945055 0.53113553 0.51587302 0.54578755
 0.55128205 0.76678877 0.7954823  0.72283272 0.65689866 0.84371184
 0.78113553 0.72893773 0.74297924 0.72

In [20]:
# What features of these are most important for ?
modelrf.fit(X, y)

all_feature_names = vectorizer.get_feature_names()
feature_importances = pd.DataFrame({'rfFeatures' : all_feature_names, 'Importance Score': modelrf.feature_importances_})
feature_importances.sort_values('Importance Score', ascending=False).head(10)

Unnamed: 0,rfFeatures,Importance Score
94,google,0.020936
41,cool,0.019669
276,wait,0.01861
28,cars,0.017243
27,car,0.015758
14,awesome,0.012303
217,self,0.012095
166,need,0.011645
63,driving,0.011556
278,want,0.011501


In [26]:
# What features of these are most important to predict negative sentiment for Logistic Regression?
modellr.fit(X, y)

all_feature_names = vectorizer.get_feature_names()
feature_importances = pd.DataFrame({'lrFeatures' : all_feature_names, 'Importance Score': modellr.coef_[0].tolist()})
feature_importances.sort_values('Importance Score', ascending=False).head(20)

Unnamed: 0,lrFeatures,Importance Score
128,jobs,2.351344
53,doesn,1.993579
97,government,1.851629
132,law,1.56326
55,dont,1.532249
220,shit,1.459055
182,point,1.449611
7,aren,1.391297
54,don,1.377928
260,trains,1.272943


In [31]:
test_tweet = [df.tweet[456]]
test_tweet

[' Great article on #disabled transportation I will opt 4  @elonmusk 2 deliver my #autonomous car http://t.co/2hJxhTPsQ6']

In [32]:
test_tweet = vectorizer.transform(test_tweet)

In [33]:
modellr.predict(test_tweet)[0]

1

In [197]:
#using tfidf
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features = 300, 
                             ngram_range=(1, 1), 
                             stop_words='english')


# Use `fit` to learn the vocabulary
vectorizer.fit(titles)

# Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X = vectorizer.transform(titles).toarray()

scores = cross_val_score(model, X, y, cv=30,scoring='recall')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

CV AUC [0.30769231 0.11538462 0.30769231 0.42307692 0.46153846 0.30769231
 0.53846154 0.5        0.15384615 0.30769231 0.23076923 0.19230769
 0.07692308 0.34615385 0.23076923 0.30769231 0.23076923 0.19230769
 0.42307692 0.42307692 0.42307692 0.23076923 0.30769231 0.26923077
 0.26923077 0.32       0.44       0.24       0.08       0.48      ], Average AUC 0.30456410256410255


In [16]:
# General:
import tweepy           # To consume Twitter's API
import pandas as pd     # To handle data
import numpy as np      # For number computing

# For plotting and visualization:
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#test aws
import psycopg2
from psycopg2 import sql
import time
## Database details
input_db_name = "osa"
input_db_user = "janaka"
input_db_password = "1qazZXC!23"
input_db_port = 5432
input_db_host = "osa.c97j4f5sor7n.eu-west-1.rds.amazonaws.com"

def create_db_connection(db_name=input_db_name, 
                         db_user=input_db_user,
                         db_password=input_db_password, 
                         db_port=input_db_port,
                         db_host=input_db_host):
    return psycopg2.connect(host=db_host, 
                            database=db_name,
                            user=db_user, 
                            password=db_password,
                            port=db_port)

#machine learning & text 
from nltk.tokenize import WordPunctTokenizer
from bs4 import BeautifulSoup
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression



In [2]:
conn = create_db_connection()
cur = conn.cursor()

In [3]:
cur.execute("""SELECT datname from pg_database""")
rows = cur.fetchall()
print(rows)

[('template0',), ('rdsadmin',), ('template1',), ('postgres',), ('janaka',), ('osa',)]


In [4]:
#create incoming tweets table
drop_table_command = "DROP TABLE incoming_tweets"
cur.execute(drop_table_command)
conn.commit()
#create_table_command = "CREATE TABLE incoming_tweets(id serial PRIMARY KEY, sentiment integer NOT NULL, tweet varchar(500))"
create_table_command = "CREATE TABLE incoming_tweets(id serial PRIMARY KEY, tweet varchar(500))"
cur.execute(create_table_command)
conn.commit()

In [5]:
#create existing tweets table
drop_table_command = "DROP TABLE existing_tweets"
cur.execute(drop_table_command)
conn.commit()
create_table_command = "CREATE TABLE existing_tweets(id serial PRIMARY KEY, tweet varchar(500), sentiment integer NOT NULL)"
cur.execute(create_table_command)
conn.commit()

In [9]:
#training data
df = pd.read_csv('twitter.csv', engine='python')
df['neg_senti']= np.where(df['sentiment']<3, 1, 0)
print(df.head(), df.shape)

   sentiment                                              tweet  neg_senti
0          5  Two places I'd invest all my money if I could:...          0
1          5  Awesome! Google driverless cars will help the ...          0
2          5  Autonomous vehicles could reduce traffic fatal...          0
3          5  Really good presentation from Jan Becker on Bo...          0
4          5  Ford just revealed it's Automated Ford Fusion ...          0 (2664, 3)


In [23]:
#generate dummy tweets for testing
def gettweets():
    
    top5=pd.read_csv('twitter.csv', engine='python').head()
    
    conn = create_db_connection()
    cur = conn.cursor()
    for i in range(len(top5)):
        sql_insert = '''insert into incoming_tweets (tweet)
        values ('{0}') 
        '''.format(top5.tweet[i].replace("'",""))
        cur.execute(sql_insert)
        conn.commit()
    conn.close()
    
    print('Dummy Incoming tweet table created')

gettweets()

Dummy Incoming tweet table created


In [24]:
#convert tweets from postgres to pandas dataframe
def topddf():
    
    conn = create_db_connection()
    cur = conn.cursor()
    cur.execute("""SELECT * from incoming_tweets limit 2""")
    output = cur.fetchall() 
    conn.close()
    
    d = []
    for i in range(len(output)):
        d.append((output[i][1]))

    df = pd.DataFrame(d, columns=['tweet'])
    
    return df

df_test = topddf()
len(df_test)

2

In [11]:
#tweet cleaner 2
from nltk.tokenize import WordPunctTokenizer
from bs4 import BeautifulSoup
import re
tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))   

def tweet_cleaner(text):
        
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    words = tok.tokenize(lower_case)
    
    refined_words=[]
    for word in words:
        if len(word)>2:
            refined_words.append(word)
            
    return (" ".join(refined_words)).strip()

testing = df.tweet
test_result = []
for t in testing:
    test_result.append(tweet_cleaner(t))
test_result

['two places invest all money could printing and self driving cars',
 'awesome google driverless cars will help the blind travel more often',
 'autonomous vehicles could reduce traffic fatalities',
 'really good presentation from jan becker bosch automated vehicle research autoauto check out',
 'ford just revealed automated ford fusion hybrid vehicle pretty amazing fordtrends ford test',
 'yeah just throwing this out there again would totally down beta test autonomous car',
 'musk reluctant partner with apple google but android controlled autonomous smart car would awesome',
 'finished drive now rush hour for meeting cant wait for autonomous google car',
 'the google autonomous car paid visit nvidia pretty cool technology',
 'finally very realistic timeline for full autonomous car capability hats off autoforum',
 'cool when cars are fully autonomous cause totally gonna sleep while car drives',
 'autonomous car driving could free the billion that wasted car insurance',
 'awesome google 

In [18]:
df['cleaned_tweet'] = np.array([ tweet_cleaner(tweet) for tweet in df.tweet ])
titles = df['cleaned_tweet'].fillna('')

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features = 300, 
                             ngram_range=(1, 1), 
                             stop_words='english',
                             binary=True)

# Use `fit` to learn the vocabulary of the titles
vectorizer.fit(titles)

# Use `tranform` to generate the sample X word matrix - one column per feature (word or n-grams)
X = vectorizer.transform(titles)

In [19]:
def trained_model(model,X,y):
    
    #scoring results
    scoresR = cross_val_score(model, X, y, cv=30, scoring='recall')
    print('CV Recall {}, Average Recall {}'.format(scoresR, scoresR.mean()))
    # F1 = (2 x recall x precision) / (recall + precision)
    scoresf1 = cross_val_score(model, X, y, cv=30, scoring='f1')
    print('CV F1 {}, Average F1 {}'.format(scoresf1, scoresf1.mean()))
    scoresAUC = cross_val_score(model, X, y, cv=30, scoring='roc_auc')
    print('CV AUC {}, Average AUC {}'.format(scoresAUC, scoresAUC.mean()))
    
    #output trained model
    trained_model = model.fit(X, y)
    
    return trained_model

In [20]:
# select model: logistic regression
from sklearn.linear_model import LogisticRegression
modellr = LogisticRegression(class_weight= 'balanced') 

#define model, X and y
model = modellr
X = CVectorizer(df)
y = df.neg_senti
df.head()

Unnamed: 0,sentiment,tweet,neg_senti,cleaned_tweet
0,5,Two places I'd invest all my money if I could:...,0,two places invest all money could printing and...
1,5,Awesome! Google driverless cars will help the ...,0,awesome google driverless cars will help the b...
2,5,Autonomous vehicles could reduce traffic fatal...,0,autonomous vehicles could reduce traffic fatal...
3,5,Really good presentation from Jan Becker on Bo...,0,really good presentation from jan becker bosch...
4,5,Ford just revealed it's Automated Ford Fusion ...,0,ford just revealed automated ford fusion hybri...


In [21]:
Output_model = trained_model(model,X,y)

#feature importance
all_feature_names = vectorizer.get_feature_names()
feature_importances = pd.DataFrame({'lrFeatures' : all_feature_names, 'Importance Score': Output_model.coef_[0].tolist()})
feature_importances.sort_values('Importance Score', ascending=False).head(5)

CV Recall [0.73076923 0.76923077 0.73076923 0.65384615 0.65384615 0.76923077
 0.80769231 0.61538462 0.26923077 0.46153846 0.5        0.57692308
 0.53846154 0.69230769 0.73076923 0.65384615 0.57692308 0.80769231
 0.76923077 0.65384615 0.65384615 0.69230769 0.80769231 0.61538462
 0.65384615 0.72       0.88       0.72       0.68       0.76      ], Average Recall 0.6714871794871794
CV F1 [0.67857143 0.63492063 0.7037037  0.68       0.59649123 0.74074074
 0.7        0.47058824 0.26415094 0.375      0.35616438 0.42253521
 0.3943662  0.52941176 0.61290323 0.51515152 0.47619048 0.71186441
 0.55555556 0.53125    0.52307692 0.61016949 0.6        0.52459016
 0.55737705 0.58064516 0.61971831 0.57142857 0.53125    0.52777778], Average F1 0.5531864366323949
CV AUC [0.85225885 0.80616606 0.87851038 0.88766789 0.83150183 0.90781441
 0.87728938 0.6978022  0.54945055 0.53113553 0.51587302 0.54578755
 0.55128205 0.76678877 0.7954823  0.72283272 0.65689866 0.84371184
 0.78113553 0.72893773 0.74297924 0.72

Unnamed: 0,lrFeatures,Importance Score
128,jobs,2.351344
53,doesn,1.993579
97,government,1.851629
132,law,1.56326
55,dont,1.532249


In [25]:
#predict on incoming tweets
test_tweet = vectorizer.transform(df_test.tweet)
df_test['Predicted_Senti'] = Output_model.predict(test_tweet).tolist()
df_test

Unnamed: 0,tweet,Predicted_Senti
0,Two places Id invest all my money if I could: ...,1
1,Awesome! Google driverless cars will help the ...,0


In [26]:
def updatetweets(df_test):
    
    conn = create_db_connection()
    cur = conn.cursor()

    for i in range(len(df_test)):
        sql_insert = '''insert into existing_tweets (tweet, sentiment)
        values ('{0}', '{1}') 
        '''.format(df_test.tweet[i].replace("'",""), df_test.Predicted_Senti[i])
        cur.execute(sql_insert)
        conn.commit()

    conn.close()
    
    print('New tweet sentiment pairs added to existing_tweets table')

updatetweets(df_test)

New tweet sentiment pairs added to existing_tweets table


In [27]:
def deletetweets(table_name):  
    
    conn = create_db_connection()
    cur = conn.cursor()
    cur.execute(
        sql.SQL("DELETE FROM {}")
            .format(sql.Identifier(table_name)))
    conn.commit()
    conn.close()
    
    print('All records were deleted from table: %s'%(table_name))

deletetweets('incoming_tweets')
deletetweets('existing_tweets')

All records were deleted from table: incoming_tweets
All records were deleted from table: existing_tweets


In [30]:
#simulate demo

def demo():
    #generate random tweets
    gettweets()
    #set dummy condition
    condition = 1
    
    while condition != 0:
        
        #convert osa incoming tweets table to pandas df
        df_test = topddf()
        
        #add predicted sentiment column
        test_tweet = vectorizer.transform(df_test.tweet)
        df_test['Predicted_Senti'] = Output_model.predict(test_tweet).tolist()
        print(df_test.head())
        
        #update new records into osa existing tweets table
        updatetweets(df_test)
        
        #set condition to stop while loop
        deletetweets('incoming_tweets')
        
        conn = create_db_connection()
        cur = conn.cursor()
        cur.execute("""SELECT * from incoming_tweets limit 1""")
        condition = cur.fetchone() 
        conn.close()

        if condition is None:
            condition = 0  

In [29]:
# test demo
demo() 

Dummy Incoming tweet table created
                                               tweet  Predicted_Senti
0  Two places Id invest all my money if I could: ...                1
1  Awesome! Google driverless cars will help the ...                0
New tweet sentiment pairs added to existing_tweets table
All records were deleted from table: incoming_tweets


In [12]:
import psycopg2
from pprint import pprint

In [27]:
class DatabaseConnection:
    def __init__(self):
        try:
            self.connection = psycopg2.connect(
                "dbname='osa' user='janaka' host='osa.c97j4f5sor7n.eu-west-1.rds.amazonaws.com' password='1qazZXC!23' port='5432'")
            self.connection.autocommit = True
            self.cursor = self.connection.cursor()
        except:
            pprint("Cannot connect to datase")

    def create_table(self):
        create_table_command = "CREATE TABLE pet(id serial PRIMARY KEY, name varchar(100), age integer NOT NULL)"
        self.cursor.execute(create_table_command)

    def insert_new_record(self):
        new_record = ("misa meo6", "6")
        insert_command = "INSERT INTO pet(name, age) VALUES('" + new_record[0] + "','" + new_record[1] + "')"
        pprint(insert_command)
        self.cursor.execute(insert_command)
        
    def query_all(self):
        self.cursor.execute("SELECT * FROM pet")
        cats = self.cursor.fetchall()
        for cat in cats:
            pprint("each pet : {0}".format(cat))

    def update_record(self):
        update_command = "UPDATE pet SET age=10 WHERE id=1"
        self.cursor.execute(update_command)

    def drop_table(self):
        drop_table_command = "DROP TABLE pet"
        self.cursor.execute(drop_table_command)

In [28]:
if __name__== '__main__':
    database_connection = DatabaseConnection()
    #database_connection.create_table()
    #database_connection.insert_new_record()
    #database_connection.query_all()
    #database_connection.update_record()
    database_connection.drop_table()