# Data Wrangling

## Dataset - WeRateDogs&trade; Twitter Archive

***By: Kartik Nanduri***<br>
**Date: 21st Nov, 2018.**

## Let's Gather

In [1]:
# importing all the necessary libraries
import os
import pandas as pd
import requests as req

***<span style="color: red">Important! Uncomment the following lines to run the notebook without errors.</span>***

In [None]:
# resetting the folder structure.
#os.rename('dataset/twitter-archive-enhanced.csv', 'twitter-archive-enhanced.csv')
#import shutil
#shutil.rmtree('dataset')

***<span style="color: green">Important! Once done, please comment them out again.</span>***

1. [x] **The file given at hand `twitter-archive-enhanced.csv`**

In [2]:
# all the required files for this project are in the list files_list:
# [0] the archive given at hand, [1] the image predictions, [2] the tweet JSON dump
files_list = ['twitter-archive-enhanced.csv', 'image-predictions.tsv', 'tweet_json.txt']

In [None]:
# reading the twitter archive file
archive = pd.read_csv(files_list[0])

# displaying the dimensions (rows, columns) of the archive file
archive.shape

2. [x] **Fetching the data from url and saving it to local drive - `image-predictions.tsv`**

In [None]:
# reading the file from internet using the requests library
url = "https://s3.amazonaws.com/video.udacity-data.com/topher/2018/November/5bf60c69_image-predictions-3/image-predictions-3.tsv"
res = req.get(url)

# failing loudly on an HTTP error, instead of silently saving an error page as the dataset
res.raise_for_status()

# writing the raw bytes of the response to the local drive, unchanged
with open(files_list[1], mode = "wb") as op_file:
    op_file.write(res.content)

In [None]:
# checking that the data was fetched the right way, by reading the saved
# tab-separated file back and looking at two random rows
img_pre_test = pd.read_csv(files_list[1], delimiter = "\t", encoding = 'utf-8')
img_pre_test.sample(2)

# we did it the right way, Yay! it worked.

3. [x] **Getting data from Twitter&trade;**

In [None]:
# importing all the necessary libraries for accessing Twitter via API
import tweepy
from tweepy import OAuthHandler
import json
from timeit import default_timer as timer

In [None]:
# setting up all the necessary credentials for the API
# the real values are read from environment variables, so that secrets never have
# to be typed into (and saved with) the notebook; the placeholders are used only
# when a variable is not set
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'xxx.xxx.xxx.xxx')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'xxx.xxx.xxx.xxx')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'xxx.xxx.xxx.xxx')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', 'xxx.xxx.xxx.xxx')

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# the JSON parser is requested because the responses are later dumped with json.dump
# and indexed by key
# NOTE(review): wait_on_rate_limit_notify looks specific to older tweepy releases -
# confirm against the installed version
api = tweepy.API(auth_handler = auth,
                 parser = tweepy.parsers.JSONParser(),
                 wait_on_rate_limit = True,
                 wait_on_rate_limit_notify = True)

In [None]:
def fetch_and_save(ids, api_ins, one_id = None):
    '''
    This function fetches tweet data, either for a list of ids or for one single id.

    Batch mode (one_id is None): every fetched status is written as one JSON document
    per line into a new file - files_list[2], or tweet_json_<n>.txt when that file
    already exists, <n> being the number of tweet_json files found in the working directory.

    Single mode (one_id is given): nothing is written to disk, only the favorite and
    retweet counts of that tweet are extracted.

    ids (List Object): a list of all tweet ids to query, used in batch mode only
    api_ins (Tweepy Object): api object instance, will be used to query twitter for data
    one_id (int): use when you want to query only for one tweet

    Returns (List Object):
        batch mode  - the ids that failed, so that this function can be called once again on those ids
        single mode - [{'tweet_id': .., 'favorites': .., 'retweets': ..}] on success,
                      [one_id] on failure
    '''
    # querying a single id
    if one_id is not None:
        try:
            content = api_ins.get_status(one_id, include_entities = True)
            return [{'tweet_id': int(one_id),
                     'favorites': int(content['favorite_count']),
                     'retweets': int(content['retweet_count'])}]

        except Exception as e:
            # best effort: report the failure and hand the id back to the caller
            print("Error for: " + str(one_id) + " - " + str(e))
            return [one_id]

    # querying a list of ids
    # checking if file exists, so that an earlier batch is never overwritten
    if os.path.exists(files_list[2]):
        existing = [s for s in os.listdir() if "tweet_json" in s]
        new_file_name = files_list[2].split('.')[0] + "_" + str(len(existing)) + ".txt"
    else:
        new_file_name = files_list[2]

    failed_ids = []
    with open(new_file_name, mode = 'w') as outfile:
        for tweet_id in ids:
            try:
                content = api_ins.get_status(tweet_id, tweet_mode='extended')
                json.dump(content, outfile)
                outfile.write('\n')

            except Exception as e:
                # best effort: one failing tweet must not stop the whole batch
                print("Error for: " + str(tweet_id) + " - " + str(e))
                failed_ids.append(tweet_id)

    return failed_ids

In [None]:
# the ids are passed to the function fetch_and_save() in batches:
# given that we can send 900 requests per 15 min window, the ids are cut into slices of at most 900
tweet_ids = archive['tweet_id'].tolist()

# set_one, two and three
set_one, set_two, set_three = tweet_ids[0:900], tweet_ids[900:1800], tweet_ids[1800:]

# checking the lengths of the batches, and that together they cover every id
batched_total = len(set_one)+len(set_two)+len(set_three)
print(len(set_one), len(set_two), len(set_three), batched_total)
print(batched_total == len(tweet_ids))

In [None]:
import time
# fetching data - 1st iteration, timed from start to end
start = timer()
test_one = fetch_and_save(set_one, api)
end = timer()

# reporting the runtime of fetch_and_save in minutes
elapsed_mins = round((end - start)/60, 1)
print("That took about {} mins.".format(elapsed_mins))

In [None]:
# number of ids that failed in the 1st iteration
print("we have about {} failed requests.".format(len(test_one)))

In [None]:
# sleeping for 6 mins before the next batch, so that Rate Limit time is reduced
print("Sleeping for 6 mins.")
time.sleep(360)
print("Done sleeping")

# fetching data - 2nd iteration, timed from start to end
start = timer()
test_two = fetch_and_save(set_two, api)
end = timer()

# reporting the runtime of fetch_and_save in minutes
elapsed_mins = round((end - start)/60, 1)
print("That took about {} mins.".format(elapsed_mins))

In [None]:
# number of ids that failed in the 2nd iteration
print("we have about {} failed requests.".format(len(test_two)))

In [None]:
# sleeping for 6 mins before the next batch, so that Rate Limit time is reduced
print("Sleeping for 6 mins.")
time.sleep(360)
print("Done sleeping")

# fetching data - 3rd iteration, timed from start to end
start = timer()
test_three = fetch_and_save(set_three, api)
end = timer()

# reporting the runtime of fetch_and_save in minutes
elapsed_mins = round((end - start)/60, 1)
print("That took about {} mins.".format(elapsed_mins))

In [None]:
# number of ids that failed in the 3rd iteration
print("we have about {} failed requests.".format(len(test_three)))

In [None]:
# merging the failed ids of all three iterations into one master list
failed_ids = test_one + test_two + test_three
print("Total failed request are: {}. \n".format(len(failed_ids)))

# ids that failed again and the records that were retrieved
indi_fail = []; success = []

# every failed id is queried once more on its own; fetch_and_save hands back a
# one-element list, holding a dict on success and the bare id on failure
for failed_id in failed_ids:
    outcome = fetch_and_save(ids = None, api_ins = api, one_id = failed_id)[0]
    if isinstance(outcome, (int)):
        indi_fail.append(outcome)
    else:
        success.append(outcome)

# checking if there is change
retrieved = len(failed_ids) - len(indi_fail)
print("\nWe were able to retrieve {} records, others failed.".format(retrieved))

4. [x] **Okay, let's combine the successful jsons into one file, called the `tweet_master.txt`**

In [None]:
# loading the three files of successful jsons, one dataframe per file
json_1, json_2, json_3 = (pd.read_json(json_file, lines = True, encoding = 'utf-8')
                          for json_file in ('tweet_json.txt', 'tweet_json_1.txt', 'tweet_json_2.txt'))

# total rows that we need to have in our resulting dataframe
print(sum(frame.shape[0] for frame in (json_1, json_2, json_3)))

In [None]:
# stacking the three dataframes into one; join = 'outer' keeps every column found in
# any of them and ignore_index = True renumbers the rows from 0
json_master = pd.concat([json_1, json_2, json_3], ignore_index = True, join = 'outer', sort = True)
# saving the combined records as the master file, one JSON document per line
json_master.to_json('tweet_master.txt', orient = 'records', lines = True)
json_master.shape

In [None]:
# removing objects that are not required any more: the dataframes, the batch
# bookkeeping and the API credentials/handles.
del archive, img_pre_test
del json_1, json_2, json_3, json_master
del indi_fail, end, start, test_one, test_two, test_three, set_one, set_two, set_three
del consumer_key, consumer_secret, access_token, access_secret, auth, api

# success and files_list are kept on purpose, both are used again in the cells below.

5. [x] **Last thing to do is to tidy up our folder, let's get going.**

In [3]:
# moving all data files under one folder - dataset
# removing the temporary files, that acted as placeholders

# creating the folder (nothing happens when it is already there)
folder = 'dataset'
os.makedirs(folder, exist_ok = True)

# we know that our master datasets for this project are
# 1. twitter-archive-enhanced.csv
# 2. image-predictions.tsv
# 3. tweet_master.txt
# let us move these files

# updating our files_list; the slot is addressed by position, so that the update
# still hits the tweet file after further entries have been appended to the list
files_list[2] = 'tweet_master.txt'

# moving only required files - the first three entries of files_list
for file in files_list[:3]:
    target = os.path.join(folder, os.path.basename(file))
    # a file that already sits inside the folder is left where it is
    if os.path.exists(file) and os.path.abspath(file) != os.path.abspath(target):
        os.rename(file, target)

# removing the tweet_json and tweet_json_1 files as they are not required anymore
for file in [s for s in os.listdir() if "tweet_json" in s]:
    # only plain files are deleted, os.remove cannot delete a directory
    if os.path.isfile(file):
        os.remove(file)

# listing the current directory
os.listdir()

# clean and neat, lets get with assessing and cleaning

['.git',
 '.ipynb_checkpoints',
 'aabcd.csv',
 'dataset',
 'error.png',
 'New Text Document.txt',
 'README.md',
 'test.py',
 'twitter-archive-enhanced.xlsx',
 'twitter_text.csv',
 'wrangle_act.ipynb']

In [4]:
# renaming files_list: the three dataset files now live inside the folder.
# the guard keeps this cell safe to re-run, a path is never prefixed twice
for i in range(3):
    if not files_list[i].startswith(folder + '/'):
        files_list[i] = folder + '/'+ files_list[i]

6. [x] **Last thing to do is to extract `retweet_count` and `favorite_count` from `tweet_master.txt`, saving the result back to the same file as line-delimited JSON.**

In [None]:
# reading the master file and keeping only the id and the two counts, renamed to
# the keys used by the records in success. the rename returns a new dataframe,
# so no in-place change is made on a slice of the original
tweet_json = (pd.read_json(files_list[2], lines = True, encoding = 'utf-8')
                [['id', 'retweet_count', 'favorite_count']]
                .rename(columns = {'id' : 'tweet_id', 'retweet_count': 'retweets', 'favorite_count': 'favorites'}))

# appending success to the master dataset.
tweet_json = pd.concat([tweet_json, pd.DataFrame.from_dict(success)],
                       ignore_index = True, sort = True)
tweet_json.shape

In [None]:
# saving the dataframe into master file, one JSON record per line
# note: this overwrites the very file the previous cell has read, and the saved
# records carry the renamed columns (tweet_id, retweets, favorites)
tweet_json.to_json(files_list[2], orient = 'records', lines = True)

## Summary - Gathering

- We know that gathering is the first step in wrangling.
- We were successful in gathering from three different sources with different techniques:
    - Data given at hand.
    - Fetch from flat file stored on a server.
    - From API.

- There are a total of 14 missing data points. I tried different ways of retrieving them, using the API as well as `twurl` from the `Ruby` package, but they could not be found, as shown below in the highlighted section.

***<span style="color: #6c6cff">So let's start with assessing the data.</span>***

![error](error.png)

## Assessing

In [5]:
# let's load up the three datasets from the dataset folder, and start assessing them:
# the archive (csv), the image predictions (tab separated) and the tweet counts (JSON lines)
archive =  pd.read_csv(files_list[0], encoding = 'utf-8')
img_pre = pd.read_csv(files_list[1], sep = '\t', encoding = 'utf-8')
retweets_fav = pd.read_json(files_list[2], lines = True, encoding =  'utf-8')

### Issues to sort!

In [None]:
# printing out archive - visual assessment of the whole table
archive

In [None]:
# Programmatic Assessment 1 - Information: column types and non-null counts
archive.info()

In [None]:
# Programmatic Assessment 2 - Describe: summary statistics of the numeric columns
archive.describe()

In [None]:
# checking for duplicates - counting the tweet_ids that repeat an earlier tweet_id
sum(archive.tweet_id.duplicated())

In [None]:
# checking if more than one class of dog has been assigned to the same tweet
# the following are the only combinations that are present in the dataset
is_doggo = archive['doggo'] == 'doggo'
cond_1 = is_doggo & (archive['floofer'] == 'floofer')
cond_2 = is_doggo & (archive['pupper'] == 'pupper')
cond_3 = is_doggo & (archive['puppo'] == 'puppo')

# printing these entries
multi_class = cond_1 | cond_2 | cond_3
archive.loc[multi_class, ['tweet_id', 'text', 'doggo', 'floofer', 'pupper', 'puppo']]

In [None]:
# number of tweets that carry more than one dog class
len(archive[cond_1 | cond_2 | cond_3])

1. **`twitter-archive-enhanced.csv`** table

***1 Content Issues:***

**1.1 Visual Assessment:**
- `rating_numerator` : has values such as 1, 3.. e.t.c - **Data Quality Dimension - `Consistency`**.
- `rating_denominator` : have values, less than 10, for example, the tweet_id - 666287406224695296 has the number 2 as its value - **Data Quality Dimension - `Consistency`**. 
- We see that, Articles - `a`, `an`, `the` have been used to name dogs, as well as words such as `such`, `quite` - **Data Quality Dimension - `Validity`**.
- There are instances where the names of dogs are in lowercases - **Data Quality Dimension - `Consistency`**.

**1.2 Programmatic Assessment:**
- `rating_numerator` : has a maximum value of 1766 - **Data Quality Dimension - `Consistency`**. 
- `rating_denominator` : has a maximum value of 170 - **Data Quality Dimension - `Consistency`**.
- All in all, this dataset appears to be clean, except for `expanded_urls` - we have about 59 instances missing - **Data Quality Dimension - `Completeness`**.
- We can see that there are more than one class assigned to tweets, analyze and assign proper dog class so that melting is easy - **Data Quality Dimension - `Consistency`**.

***2 Structural Issues:***

**2.1 Visual Assessment:**
- We can see that there are four classes of dogs: `doggo`, `floofer`, `puppo`, `pupper`; these should be part of one unit - `dog_class` - **Data Quality Dimension - `Consistency`**.

**2.2 Programmatic Assessment:**
- `in_reply_to_status_id`, `retweeted_status_id`, `retweeted_status_user_id`, `in_reply_to_user_id` of type float64 must be converted into int - **Data Quality Dimension - `Validity`**.
- `timestamp`, `retweeted_status_timestamp` of type object must be converted into datetime - **Data Quality Dimension - `Validity`**.

In [None]:
# assessing img_predictions dataset - visual assessment of the whole table
img_pre

In [None]:
# Programmatic Assessment - Information: column types and non-null counts
img_pre.info()

In [None]:
# checking for duplicates: keep = False flags every row whose jpg_url occurs more than once,
# sorting by jpg_url then puts the tweets that share an image next to each other
img_pre[img_pre['jpg_url'].duplicated(keep = False)].sort_values(by = 'jpg_url')[['tweet_id', 'jpg_url']]

2. **`image-predictions.tsv`** table

***1 Content Issues:***

**1.1 Visual Assessment:**
- We have few dog breeds that are represented in lowercase.

**1.2 Programmatic Assessment:**
- We have about 281 images on a whole, that are missing with respect to our `twitter-archive-enhanced.csv` file - **Data Quality Dimension - `Completeness`**.
- We can see that, we have about `66` duplicates **OR** a pair of tweets are pointing to same *`jpg_url`* - **Data Quality Dimension - `Accuracy`**.

***2 Structural Issues:***

**2.1 Visual Assessment:**
- None. 

**2.2 Programmatic Assessment:**
- None.

In [None]:
# assessing tweet_master dataset - visual assessment of the whole table
retweets_fav

In [None]:
# Programmatic Assessment - Information: column types and non-null counts
retweets_fav.info()

3. **`tweet_master.txt`** table

***1 Content Issues:***

**1.1 Visual Assessment:**
- None.

**1.2 Programmatic Assessment:**
- We have about 14 missing records - **Data Quality Dimension - `Completeness`**.

***2 Structural Issues:***

**2.1 Visual Assessment:**
- None.

**2.2 Programmatic Assessment:**
- None.

## Summary - Assessing

- Completed the second step.
- The following are the insights:
    - from the `twitter-archive-enhanced.csv` dataset, the rating_numerator and rating_denominator need to be fixed.
    - the dataset also represents row values as columns, which needs to be fixed.
    - the dataset also has structural issues such as wrong datatypes assigned to columns.
    - from the `image-predictions.tsv` dataset, there is a consistency issue with the naming of dog breeds.
    - the dataset isn't complete when compared to `twitter-archive-enhanced.csv`, we have about 281 missing tweets.
    - Also, we have `jpg_url`s that are shared by a pair of tweets.
    - the `tweet_master.txt` dataset has about 14 missing records.
    - the dataset alone holds the information about retweets and favourites - a bad form of schema normalization.

## Cleaning

#### Define
- Important! Before we get to cleaning, let's drop the rows from image-predictions that are false in all of `p1_dog`, `p2_dog` and `p3_dog`, as they are not related to our dataset.

#### Code

In [6]:
# keeping every row in which at least one of the three predictions is not False,
# i.e. only the rows that are False for p1_dog, p2_dog and p3_dog alike are dropped
any_dog = (img_pre.p1_dog != False) | (img_pre.p2_dog != False) | (img_pre.p3_dog != False)
img_pre = img_pre[any_dog]

####  Test

In [7]:
# asserting that no row is left that is False in all three predictions
all_false = (img_pre.p1_dog == False) & (img_pre.p2_dog == False) & (img_pre.p3_dog == False)
assert all_false.sum() == 0, "Check"

In [8]:
# the master dataset: every archive row is kept (left joins), the image predictions
# and the retweet/favorite counts are attached where the tweet_id matches
master_set = archive.merge(img_pre, how = 'left', on = ['tweet_id'])
master_set = master_set.merge(retweets_fav, how = 'left', on = ['tweet_id'])
# registering the path of the raw master set as a further entry of files_list
# (note: a new entry is appended every time this cell is run)
files_list.append('dataset/master_set_raw.csv')
master_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2356 entries, 0 to 2355
Data columns (total 30 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
jpg_url                       17

In [9]:
# saving the file to local disk, without writing the row index as a column.
master_set.to_csv(files_list[3], index = False)

In [10]:
# reading the saved master set back from disk and creating a copy of it;
# the cleaning steps below operate on master_copy
master_set = pd.read_csv(files_list[3], encoding = 'utf-8')
master_copy = master_set.copy()

### Issues to Clean.

#### 1. Basic cleaning.

#### Define
- Assign proper class for the above 14 tweets before melting.
- Delete *retweets* with *any duplicates* and get rid of *tweets with **no** images*.
- Once done, drop the following columns:
    1. `retweeted_status_id`
    2. `retweeted_status_user_id`
    3. `retweeted_status_timestamp`
    4. `in_reply_to_status_id`
    5. `in_reply_to_user_id`
    
#### Code

In [11]:
# setting column width to -1, so that the text of the tweets is printed in full
# NOTE(review): newer pandas releases expect None instead of -1 here - confirm for the installed version
pd.set_option('display.max_colwidth', -1)
# flagging the rows that carry 'doggo' together with a second dog class
cond_1 = (master_copy['doggo'] == 'doggo') & (master_copy['floofer'] == 'floofer')
cond_2 = (master_copy['doggo'] == 'doggo') & (master_copy['pupper'] == 'pupper')
cond_3 = (master_copy['doggo'] == 'doggo') & (master_copy['puppo'] == 'puppo')
print(master_copy[cond_1 | cond_2 | cond_3][['tweet_id', 'text']])

                tweet_id  \
191   855851453814013952   
200   854010172552949760   
460   817777686764523521   
531   808106460588765185   
565   802265048156610565   
575   801115127852503040   
705   785639753186217984   
733   781308096455073793   
778   775898661951791106   
822   770093767776997377   
889   759793422261743616   
956   751583847268179968   
1063  741067306818797568   
1113  733109485275860992   

                                                                                                                                                                      text  
191   Here's a puppo participating in the #ScienceMarch. Cleverly disguising her own doggo agenda. 13/10 would keep the planet habitable for https://t.co/cMhq16isel        
200   At first I thought this was a shy doggo, but it's actually a Rare Canadian Floofer Owl. Amateurs would confuse the two. 11/10 only send dogs https://t.co/TXdT3tmuYk  
460   This is Dido. She's playing the lead role in "Pupper S

***Assign the following:***
1. 855851453814013952: puppo
2. 854010172552949760: floofer
3. 817777686764523521: pupper
4. 808106460588765185: pupper
5. 802265048156610565: pupper
6. 801115127852503040: pupper
7. 785639753186217984: pupper
8. 781308096455073793: pupper
9. 775898661951791106: pupper
10. 770093767776997377: pupper
11. 759793422261743616: pupper
12. 751583847268179968: doggo
13. 741067306818797568: doggo
14. 733109485275860992: doggo

**<span style="color: green">I like puppies, so for most of the entries it is pupper!</span>**

In [12]:
# assigning values.
# each tweet below was tagged with more than one class; the class listed here is the
# one that stays, every other class column of that tweet is set to the string 'None'
stage_columns = ['doggo', 'floofer', 'pupper', 'puppo']
stage_to_keep = {
    855851453814013952: 'puppo',
    854010172552949760: 'floofer',
    817777686764523521: 'pupper',
    808106460588765185: 'pupper',
    802265048156610565: 'pupper',
    801115127852503040: 'pupper',
    785639753186217984: 'pupper',
    781308096455073793: 'pupper',
    775898661951791106: 'pupper',
    770093767776997377: 'pupper',
    759793422261743616: 'pupper',
    751583847268179968: 'doggo',
    741067306818797568: 'doggo',
    733109485275860992: 'doggo',
}

for tweet_id, keep in stage_to_keep.items():
    to_blank = [col for col in stage_columns if col != keep]
    master_copy.loc[master_copy['tweet_id'] == tweet_id, to_blank] = 'None'

####  Test - 1

In [13]:
# all values have been properly assigned: the rows flagged earlier should now show one class each
# the column width is set back to 50 for regular display
pd.set_option('display.max_colwidth', 50)
master_copy[cond_1 | cond_2 | cond_3][['tweet_id', 'doggo', 'floofer', 'pupper', 'puppo']]

Unnamed: 0,tweet_id,doggo,floofer,pupper,puppo
191,855851453814013952,,,,puppo
200,854010172552949760,,floofer,,
460,817777686764523521,,,pupper,
531,808106460588765185,,,pupper,
565,802265048156610565,,,pupper,
575,801115127852503040,,,pupper,
705,785639753186217984,,,pupper,
733,781308096455073793,,,pupper,
778,775898661951791106,,,pupper,
822,770093767776997377,,,pupper,


In [14]:
# columns that only describe retweets and replies, not needed once those rows are filtered
retweet_reply_columns = ['retweeted_status_id',
                         'retweeted_status_user_id',
                         'retweeted_status_timestamp',
                         'in_reply_to_status_id',
                         'in_reply_to_user_id']

# one pass over master_copy:
# 1. keep the tweets that are not retweets (no retweeted_status_id)
# 2. delete duplicates, if any
# 3. delete the tweets with no image (no jpg_url)
# 4. reset the index, then drop the columns listed above
master_copy = (master_copy[master_copy['retweeted_status_id'].isnull()]
               .drop_duplicates()
               .dropna(subset = ['jpg_url'])
               .reset_index(drop = True)
               .drop(labels = retweet_reply_columns, axis = 1))

#### Test - 2

In [15]:
# after dropping the five columns, we should have about 25 dimensions/columns
master_copy.shape

(1686, 25)

#### 2. Condense wide-format to long-format

#### Define
- Condense `doggo`, `floofer`, `pupper`, `puppo` as `dog_class`.

#### Code

In [16]:
# counting each class before melting, to make sure that we have the same counts afterwards
doggo, floofer, pupper, puppo = (master_copy[stage].value_counts()[stage]
                                 for stage in ('doggo', 'floofer', 'pupper', 'puppo'))

# printing count of each class
count_report = "Count of Doggo: {}\nCount of Floofer: {}\nCount of Pupper: {}\nCount of Puppo: {}"
print(count_report.format(doggo, floofer, pupper, puppo))

Count of Doggo: 57
Count of Floofer: 8
Count of Pupper: 173
Count of Puppo: 22


In [17]:
# selecting the columns that are to be melted
columns_to_melt = ['doggo', 'floofer', 'pupper', 'puppo']
columns_to_stay = [x for x in master_copy.columns.tolist() if x not in columns_to_melt]

# melting the columns into values: every tweet now appears four times, once per class column
master_copy = pd.melt(master_copy, id_vars = columns_to_stay, value_vars = columns_to_melt,
                         var_name = 'stages', value_name = 'dog_class')

# Delete column 'stages' (the axis is passed by keyword, the positional form is not
# accepted by every pandas version)
master_copy = master_copy.drop('stages', axis = 1)

# dropping duplicates: after sorting, 'None' comes before the lowercase class names,
# so keep = 'last' retains the real class of a tweet whenever it has one.
# missing values are sorted first as well, so that they can never win over a real class
master_copy = (master_copy.sort_values('dog_class', na_position = 'first')
                          .drop_duplicates('tweet_id', keep = 'last')
                          .reset_index(drop = True))

#### Test

In [18]:
# let's assert that melting has not lost any class entry
melted_counts = master_copy.dog_class.value_counts()
for stage, expected in (('doggo', doggo), ('floofer', floofer), ('pupper', pupper), ('puppo', puppo)):
    assert expected == melted_counts[stage], "Some entries are missing"

#### 3. Fix all inaccurate data.

#### Define
- fix names of dogs.
- fix ratings.
- check source column.

#### Code

In [19]:
# Checking source column - number of distinct values
master_copy.source.nunique()

3

**<span style="color: red">Okay! only three values, a categorical variable</span>**

In [20]:
import re

def _anchor_text(html):
    '''Return what stands between the first '>' and the last '<' on a line of html.'''
    return re.findall(r'>(.*)<', html)[0]

# replacing the html of source by its readable text.
master_copy['source'] = master_copy['source'].map(_anchor_text)

####  Test - 1

In [21]:
# taking a look at a random sample of 5 rows, to check the cleaned source values
master_copy.sample(5)[['tweet_id', 'source', 'text']]

Unnamed: 0,tweet_id,source,text
106,879008229531029506,Twitter for iPhone,This is Beau. That is Beau's balloon. He takes...
255,685315239903100929,Twitter for iPhone,I would like everyone to appreciate this pup's...
438,702217446468493312,Twitter for iPhone,"I know it's tempting, but please stop sending ..."
430,700847567345688576,Twitter for iPhone,Meet Crouton. He's a Galapagos Boonwiddle. Has...
899,806219024703037440,Twitter for iPhone,We only rate dogs. Please stop sending in non-...


In [22]:
# fixing names
# names written entirely in lowercase are ordinary words, not names of dogs
non_names = master_copy.loc[master_copy.name.str.islower(), 'name'].unique().tolist()

# a single uppercase letter is not a name either. the parentheses are required:
# `&` binds tighter than `==`, without them 1 & isupper() would be evaluated first
flag = (master_copy.name.str.len() == 1) & master_copy.name.str.isupper()

# every such letter is collected (not only the first one), and nothing breaks
# when there is none
non_names.extend(master_copy.loc[flag, 'name'].unique().tolist())

In [23]:
# replacing all garbage names with none, once done, we'll use the text field to extract names
# a single mask over the name column replaces all of them in one pass
master_copy.loc[master_copy['name'].isin(non_names), 'name'] = 'None'

In [24]:
# checking if there are any non_names after the operation
# both conditions are evaluated again on the current names, and a row fails the check
# when it meets either of them (a name can never be lowercase and uppercase at once)
still_lowercase = master_copy.name.str.islower()
still_one_letter = (master_copy.name.str.len() == 1) & master_copy.name.str.isupper()
assert len(master_copy[still_lowercase | still_one_letter]) == 0, "Check code"

***The following patterns were observed in the `text` field; we shall use them to extract the names:***
- This is [name] ..
- Meet [name] ..
- Say hello to [name] ..
- .. named [name] ..
- .. name is [name] ..

We will treat those cases to get the names from the text of the tweet

In [38]:
# extracting names using regular expression.

# assigning patterns
pattern_1 = r'(T|t)his\sis\s([^.|,]*)'
pattern_2 = r'Meet\s([^.|,]*)'
pattern_3 = r'Say\shello\sto\s([^.|,]*)'
pattern_4 = r'name\sis\s([^.|,]*)'

# compiled once under private names, so that the helper below does not depend on
# the pattern_* variables keeping their values
_THIS_IS = re.compile(pattern_1)
_MEET = re.compile(pattern_2)
_SAY_HELLO = re.compile(pattern_3)
_NAME_IS = re.compile(pattern_4)


def _first_word(phrase):
    '''Return the first whitespace separated word of phrase.'''
    return phrase.split()[0]


def _join_two_dogs(phrase):
    '''Build one name out of a phrase that contains '&amp;', i.e. introduces two dogs.'''
    words = phrase.split()
    if len(words) == 1:
        return words[0]
    if len(words) == 3:
        return words[0] + "_" + words[-1]
    return words[0] + "_" + words[-2]


def _extract_dog_name(text):
    '''
    Return the name found in the text of one tweet, or 'None' when no pattern matches.
    text (str): the text of the tweet
    The patterns are tried in a fixed order, the first one that matches decides:
    'This is ..', 'Meet ..', 'Say hello to ..', '.. name is ..'
    '''
    # Start with 'This is '
    found = _THIS_IS.search(text)
    if found:
        phrase = found.group(2)
        # an alternate name in brackets or after AKA: the first word is the name
        if "(" in phrase or "AKA" in phrase:
            return _first_word(phrase)
        if '&amp;' in phrase:
            return _join_two_dogs(phrase)
        # '.. named [name]': the last word is the name
        if 'named' in phrase:
            return phrase.split()[-1]
        return phrase

    # Start with 'Meet '
    found = _MEET.search(text)
    if found:
        phrase = found.group(1)
        if '&amp;' in phrase:
            return _join_two_dogs(phrase)
        if '(' in phrase:
            return _first_word(phrase)
        return phrase

    # Start with 'Say hello to '
    found = _SAY_HELLO.search(text)
    if found:
        phrase = found.group(1)
        if '(' in phrase:
            return _first_word(phrase)
        if '&amp;' in phrase:
            return _join_two_dogs(phrase)
        return phrase

    # contains 'name is'
    found = _NAME_IS.search(text)
    if found:
        phrase = found.group(1)
        words = phrase.split()
        # nothing usable follows 'name is' - treated as no name, instead of failing
        if not words:
            return 'None'
        return phrase if len(words) == 1 else words[0]

    # No name specified or other style
    return 'None'


# looping through text and extracting names
dog_names = [_extract_dog_name(text) for text in master_copy['text']]

# adding this new set of names to our master_copy
master_copy['dog_names'] = dog_names

In [42]:
# we have some new non names, we are going to set them as None,
# because these good dogs have no names.
# note: pattern_4 is assigned a new regular expression here - a name that starts with a lowercase letter
pattern_4 = r'^[a-z].*'
starts_lowercase = master_copy['dog_names'].str.contains(pattern_4, na = False)

# collecting every distinct non name once, before it is overwritten
non_names = [re.search(pattern_4, name).group()
             for name in master_copy.loc[starts_lowercase, 'dog_names'].unique()]

# one mask replaces all of them in a single pass
master_copy.loc[starts_lowercase, 'dog_names'] = 'None'

# printing unique names from dog_names
master_copy['dog_names'].unique().tolist()

['None',
 'Dook_Milo',
 'Big',
 'Naphaniel',
 'Frank',
 'Klint',
 'Kial',
 'Olive',
 'Jessiga',
 'Hall and Oates',
 'Filup',
 'Cheryl',
 'Tedrick',
 'Stu',
 'Erik',
 'Cleopatricia',
 'Otis',
 'Jiminy',
 'Alfie',
 'Philippe from Soviet Russia',
 'Kohl',
 'Carll',
 'Keet',
 'Clybe',
 'Gabe',
 'Pipsy',
 'Bradlay',
 'Churlie',
 'Kenneth',
 'Clarence',
 'Timison',
 'Joshwa',
 'Genevieve',
 'Fwed',
 'Biden',
 'Cupcake',
 'Reese and Twips',
 'Alfonso',
 'Skittles',
 'Torque',
 'Bisquick',
 'Ron',
 'Jockson',
 'Jareld',
 'Jeph',
 'Walter',
 'Scout',
 'Kreggory',
 'Jimothy',
 'Christoper',
 'Johm',
 'Lugan',
 'Josep',
 'Octaviath',
 'Tilly',
 'Nelly',
 'Dante',
 'Penny',
 'Aja',
 'Emmy',
 'Shadow',
 'Beau',
 'Jack',
 'Bailey',
 'Maya',
 'Canela',
 'Jeffrey',
 'Gerald',
 'Ralphus',
 'Zeke',
 'Jim',
 'Oliver',
 'Mingus',
 'Bruno',
 'Koda',
 'Zoey',
 'Jax',
 'Franklin',
 'Darla',
 'Archie',
 'Ted',
 'Waffles',
 'Jimbo',
 'Romeo',
 'Jesse',
 'Bella',
 'Gary',
 'Rey',
 'Koko',
 'Alfy',
 'Stanley',
 