# Tweet Feature Extraction

In [23]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from collections import Counter

In [24]:
# Load the raw scraped tweets; every later cell derives from this frame.
tweets_csv_data = pd.read_csv(filepath_or_buffer="tweets.csv")

In [25]:
# Tally tweets per value of the 'Series' column (missing values are counted too).
# Bracket indexing avoids reading `.Series` as the pandas Series class.
Counter(tweets_csv_data['Series'])

Counter({'_Series_A': 91071,
         nan: 113,
         '_Series_B': 173305,
         '_Series_C': 90223,
         '_Series_D': 33135})

In [26]:
def get_num_first(text):
    """Return the integer formed by the first whitespace-separated token of ``text``.

    Parameters
    ----------
    text : str
        Text expected to start with a number, e.g. ``'12 likes'``.

    Returns
    -------
    int
        The parsed leading integer, or ``0`` when ``text`` is not a string
        (e.g. NaN or None), is empty/blank, or does not start with an integer.
    """
    try:
        return int(text.split()[0])
    except (AttributeError, IndexError, TypeError, ValueError):
        # AttributeError: non-string input (no .split); IndexError: no tokens;
        # ValueError/TypeError: first token is not an integer literal.
        # Only parsing failures are absorbed so that KeyboardInterrupt and
        # SystemExit still propagate.
        return 0

In [27]:
# Parse the leading integer from the CSV columns at positions 2 and 4.
# .iloc makes the positional lookup explicit: integer keys on a string-labelled
# row (row[2]) only work through a positional fallback that pandas has
# deprecated. Applying to the column also avoids building a Series per row.
# NOTE(review): presumably these are the like and retweet count columns —
# confirm against the CSV header and switch to column names if so.
likes = tweets_csv_data.iloc[:, 2].apply(get_num_first)
retweets = tweets_csv_data.iloc[:, 4].apply(get_num_first)

In [28]:
from datetime import datetime
def get_date(text, date_format='%d %b %Y'):
    """Parse a date string into a ``datetime.date``.

    Parameters
    ----------
    text : str
        Date text, e.g. ``'05 Mar 2016'`` for the default format.
    date_format : str, optional
        ``strptime`` format string; defaults to ``'%d %b %Y'``.

    Returns
    -------
    datetime.date or int
        The parsed date, or the sentinel ``0`` when ``text`` is not a string
        (e.g. NaN) or does not match ``date_format``. The ``0`` sentinel is
        kept because the notebook filters rows on ``date != 0``.
    """
    try:
        return datetime.strptime(text, date_format).date()
    except (TypeError, ValueError):
        # TypeError: non-string input; ValueError: text does not match the format.
        return 0

In [29]:
# Parse the CSV column at position 7 into dates (get_date yields 0 on failure).
# .iloc makes the positional lookup explicit: row[7] on a string-labelled row
# only works through a positional fallback that pandas has deprecated.
# NOTE(review): presumably this is the tweet date column — confirm against the
# CSV header and switch to the column name if so.
dates = tweets_csv_data.iloc[:, 7].apply(get_date)

In [30]:
# Assemble the working frame: pass-through CSV columns under new names plus the
# parsed likes / retweets / dates computed above. Column order is preserved.
tweets_df = pd.DataFrame(dict(
    company_name=tweets_csv_data['Company'],
    tweeter_href=tweets_csv_data['Href'],
    likes=likes,
    tweeter_name=tweets_csv_data['Name'],
    retweets=retweets,
    Series=tweets_csv_data['Series'],
    text=tweets_csv_data['Text'],
    date=dates,
))

In [31]:
# Number of distinct company names (a missing name counts as one value).
tweets_df['company_name'].nunique(dropna=False)

2202

In [32]:
# Keep only tweets that have text and a parsed date (get_date returns 0 on failure).
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not trigger SettingWithCopyWarning.
tweets_df = tweets_df[tweets_df['text'].notnull() & (tweets_df['date'] != 0)].copy()

In [33]:
# Character count of each tweet's text.
tweets_df['text_length'] = tweets_df['text'].map(len)

In [34]:
# Number of tweets (non-null 'likes' entries) per (company, funding series) pair.
tweet_count = tweets_df.groupby(['company_name', 'Series'])['likes'].count()

In [35]:
def get_tags(text):
    """Collect mentions, hashtags and links from a tweet's text.

    The text is split on whitespace and each token is classified by its prefix:

    * ``'@'`` followed by at least one more character -> ``persontags``
    * ``'#'`` followed by at least one more character -> ``hashtags``
    * ``'http'`` followed by at least one more character -> ``links``

    Tokens are returned exactly as they appear in the text, in order of
    appearance.

    Returns
    -------
    dict
        ``{'persontags': [...], 'hashtags': [...], 'links': [...]}``
    """
    tokens = text.split()
    return {
        'persontags': [tok for tok in tokens if tok.startswith('@') and len(tok) > 1],
        'hashtags': [tok for tok in tokens if tok.startswith('#') and len(tok) > 1],
        'links': [tok for tok in tokens if tok.startswith('http') and len(tok) > 4],
    }

In [36]:
# Parse each tweet once and reuse the result for all six derived columns,
# rather than calling get_tags six times per tweet.
tweet_tags = tweets_df['text'].apply(get_tags)
tag_kinds = ['persontags', 'hashtags', 'links']
# The list-valued columns go first, then the counts, to keep the column order.
for tag_kind in tag_kinds:
    tweets_df[tag_kind] = tweet_tags.apply(lambda tags, key=tag_kind: tags[key])
for tag_kind in tag_kinds:
    tweets_df[tag_kind + '_count'] = tweets_df[tag_kind].apply(len)

In [37]:
# Average likes and retweets per (company, funding series) pair.
likes_mean = tweets_df.groupby(['company_name', 'Series'])['likes'].mean()
retweets_mean = tweets_df.groupby(['company_name', 'Series'])['retweets'].mean()

In [38]:
# Keep only tweets that have text and a parsed date (get_date returns 0 on failure).
# NOTE(review): this repeats the filter already applied in an earlier cell, so it
# should be a no-op on a top-to-bottom run — consider removing one of the two.
# .copy() makes the result an independent frame, so the column assignment in the
# next cell does not trigger SettingWithCopyWarning.
tweets_df = tweets_df[tweets_df['text'].notnull() & (tweets_df['date'] != 0)].copy()

In [39]:
# Character count of each tweet's text (recomputed on the filtered frame).
tweets_df['text_length'] = list(map(len, tweets_df['text']))

In [40]:
# Per-(company, funding series) mean of every numeric column.
# numeric_only=True is required because the frame also holds string, list and
# date columns; without it, recent pandas raises TypeError instead of silently
# dropping the non-numeric columns.
mean_groupby = tweets_df.groupby(['company_name', 'Series']).mean(numeric_only=True)

In [41]:
# Number of tweets (non-null 'likes' entries) per (company, funding series) pair.
count_groupby = tweets_df.groupby(['company_name', 'Series'])['likes'].count()

In [42]:
# Turn the per-group counts into a one-column frame whose column is named 'count'.
# Renaming through a function keeps the cell safe to re-run on its own output.
count_groupby = pd.DataFrame(count_groupby).rename(columns=lambda _label: 'count')

In [43]:
# Non-null 'date' entries per (company, funding series) pair; selecting the
# column before counting avoids counting every other column first.
count = tweets_df.groupby(['company_name', 'Series'])['date'].count()

In [44]:
# Load the per-company funding rounds and preview the first rows.
funding_csv = pd.read_csv(filepath_or_buffer='funding.csv')
funding_csv.head(n=5)

Unnamed: 0,Description,Market,Names,No_Stage_Amount,No_Stage_Date,Pitch,Seed_Amount,Seed_Date,Series_A_Amount,Series_A_Date,Series_B_Amount,Series_B_Date,Series_C_Amount,Series_C_Date,Series_D_Amount,Series_D_Date,Stage
0,,Cable,Epic-Sciences,,,,,,,,"$13,000,000","Nov 13, 2012","$30,000,000","Jul 30, 2014",,,Series C
1,,All Students,Apreso-Classroom,,,,,,,,"$15,000,000","Oct 14, 2008",,,,,Series B
2,Visualead (视觉码) creates better interactions be...,Bridging Online and Offline,Visualead,,,Effective and Secure Offline to Mobile experie...,"$750,000","Mar 25, 2012","$1,600,000","Aug 15, 2013",Unknown,"Jan 20, 2015",,,,,Series B
3,,Food Processing,Onshift,"$7,000,000","Feb 3, 2014",,,,,,"$3,000,000","Feb 2, 2012",,,,,Series C
4,,-,Xendex-Holding,,,,,,Unknown,"Jun 25, 2008","$3,500,000","Nov 30, 2009",,,,,Series A
