## Manipulating Data in Dataframes

In [1]:
#Importing pysark and creating a session
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('ManipulatingDf').getOrCreate()
spark

## Taken this dataset from kaggle

**Source:** https://www.kaggle.com/kapastor/democratvsrepublicantweets#ExtractedTweets.csv

This dataset tweets extracted from all of the representives (latest 200 as of May 17th 2018)

In [2]:
#Importing the data 
path ='datasets-intro/'
tweets = spark.read.csv(path+'Rep_vs_Dem_tweets.csv', inferSchema=True, header=True)

In [3]:
tweets.limit(5).toPandas()

Unnamed: 0,Party,Handle,Tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P..."
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...
3,"Congress has allocated about $18…""",,
4,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...


In [4]:
#To get the full tweet instead of truncated tweet, I will use select method
tweets.select('Tweet').show(3,False)

+--------------------------------------------------------------------------------------------------------------------------------------------+
|Tweet                                                                                                                                       |
+--------------------------------------------------------------------------------------------------------------------------------------------+
|Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L |
|RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…|
|RT @NBCLatino: .@RepDarrenSoto noted that Hurricane Maria has left approximately $90 billion in damages.                                    |
+--------------------------------------------------------------------------------------------------------------------------------------------+

In [5]:
tweets.printSchema()

root
 |-- Party: string (nullable = true)
 |-- Handle: string (nullable = true)
 |-- Tweet: string (nullable = true)



In [6]:
#Identifying any tweet that mentions the handle @LatinoLeader using regexp_extract
from pyspark.sql.functions import *
#withcolumn creates a column, regexp () takes a substring and . takes all characters except new
latino = tweets.withColumn('Latino_mentions', regexp_extract(tweets.Tweet, '(.)(@LatinoLeader)(.)',2)) 
latino.limit(6).toPandas()

Unnamed: 0,Party,Handle,Tweet,Latino_mentions
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P...",
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...,
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...,
3,"Congress has allocated about $18…""",,,
4,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...,@LatinoLeader
5,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...,


In [7]:
#Replacing any value other than 'Democrate' or 'Republican' with 'Other' in party column
#First getting to know how many different parties are present with their count
from pyspark.sql.functions import *
counts = tweets.groupBy("Party").count()
counts.orderBy(desc("count")).show(6)

+--------------------+-----+
|               Party|count|
+--------------------+-----+
|          Republican|44392|
|            Democrat|42068|
|            That’s…"|   28|
|https://t.co/oc6J...|   22|
|                 Now|   17|
|               Today|   13|
+--------------------+-----+
only showing top 6 rows



In [8]:
from pyspark.sql.functions import when

clean = tweets.withColumn('Party', when(tweets.Party == 'Democrat','Democrat').when(tweets.Party == 'Republican','Republican').otherwise('Other'))
counts = clean.groupBy("Party").count()
counts.orderBy(desc("count")).show()

+----------+-----+
|     Party|count|
+----------+-----+
|Republican|44392|
|  Democrat|42068|
|     Other| 6029|
+----------+-----+



In [9]:
#Deleting all the embedded links
print("Cleaned Tweet")
#Using regex to clean tweets which removes embedded links
tweets.withColumn('cleaned', regexp_replace('Tweet', '(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '')).select("cleaned").show(1,False)

Cleaned Tweet
+--------------------------------------------------------------------------------------------------------------------+
|cleaned                                                                                                             |
+--------------------------------------------------------------------------------------------------------------------+
|Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… |
+--------------------------------------------------------------------------------------------------------------------+
only showing top 1 row



In [10]:
#Removing any leading or trailing white space in tweet column
from pyspark.sql.functions import *
tweets.select("Tweet").show(5, False)
tweets.select('Tweet', trim(tweets.Tweet)).show(5,False)

+--------------------------------------------------------------------------------------------------------------------------------------------+
|Tweet                                                                                                                                       |
+--------------------------------------------------------------------------------------------------------------------------------------------+
|Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L |
|RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…|
|RT @NBCLatino: .@RepDarrenSoto noted that Hurricane Maria has left approximately $90 billion in damages.                                    |
|null                                                                                                                                        |

In [11]:
#Renaming the "Party" column to 'Dem_Rep'
renamed = tweets.withColumnRenamed('Party','Dem_Rep')
renamed.limit(4).toPandas()

Unnamed: 0,Dem_Rep,Handle,Tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P..."
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...
3,"Congress has allocated about $18…""",,


In [12]:
#Concatenating the party and handle columns
from pyspark.sql.functions import *
tweets.select(tweets.Party,tweets.Handle, concat_ws('', tweets.Party,tweets.Handle).alias('Concatenated')).show(5,False)

+----------------------------------+-------------+----------------------------------+
|Party                             |Handle       |Concatenated                      |
+----------------------------------+-------------+----------------------------------+
|Democrat                          |RepDarrenSoto|DemocratRepDarrenSoto             |
|Democrat                          |RepDarrenSoto|DemocratRepDarrenSoto             |
|Democrat                          |RepDarrenSoto|DemocratRepDarrenSoto             |
|Congress has allocated about $18…"|null         |Congress has allocated about $18…"|
|Democrat                          |RepDarrenSoto|DemocratRepDarrenSoto             |
+----------------------------------+-------------+----------------------------------+
only showing top 5 rows



In [13]:
#Extracting hashtags from all the tweets
from pyspark.sql.functions import *
# Parenthesis are used to mark a subexpression within a larger expression
# The . matches any character other than a new line
# | means is like or
# \w+ means followed by any word
pattern = '(.|'')(#)(\w+)'
# * is used to match the preceding character zero or more times.
# ? will match the preceding character zero or one times, but no more.
# $ is used to match the ending position in a string. 
split_pattern = r'.*?({pattern})'.format(pattern=pattern)
end_pattern = r'(.*{pattern}).*?$'.format(pattern=pattern)

# $1 here means to capture the first part of the regex result
# The , will separate each find with a comma in the a array we create
df2 = tweets.withColumn('a', regexp_replace('Tweet', split_pattern, '$1,')).where(col('Tweet').like('%#%'))
df2.select('a').show(3,False)
# Remove all the other results that came up
df3 = df2.withColumn('a', regexp_replace('a', end_pattern, '$1'))
df3.select('a').show(3,False)
# Finally create an array from the result by splitting on the comma
df4 = df3.withColumn('a', split('a', r','))
df4.select('a').show(3,False)
df4.limit(3).toPandas()

+-----------------------------------------------------------------------------------------+
|a                                                                                        |
+-----------------------------------------------------------------------------------------+
| #SaveTheInternet, #NetNeutrality, legislation here in the House… https://t.co/n3tggDLU1L|
| #NALCABPolicy2018,.…                                                                    |
| #NetNeutrality, rules. Find out…                                                        |
+-----------------------------------------------------------------------------------------+
only showing top 3 rows

+---------------------------------+
|a                                |
+---------------------------------+
| #SaveTheInternet, #NetNeutrality|
| #NALCABPolicy2018               |
| #NetNeutrality                  |
+---------------------------------+
only showing top 3 rows

+------------------------------------+
|a             

Unnamed: 0,Party,Handle,Tweet,a
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P...","[ #SaveTheInternet, #NetNeutrality]"
1,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...,[ #NALCABPolicy2018]
2,Democrat,RepDarrenSoto,RT @Tharryry: I am delighted that @RepDarrenSo...,[ #NetNeutrality]


In [14]:
#Creating a dataframe
from pyspark.sql.types import *

sample = [('Santhosh','Sahini','1987-4-8','2016-1-7','2017-2-3','2018-3-2') \
            ,('Sameera','Appana','1986-4-8','2015-1-7','2017-1-3','2018-1-2') \
            ,('Ginny','Ginger','1986-7-10','2014-8-7','2015-2-3','2016-3-2') \
            ,('Vijay','Tracy','1988-5-2','2016-1-7','2018-2-3','2018-3-2') \
            ,('Jon','Ravi','1987-5-11','2016-5-7','2017-1-3','2018-9-2') \
            ,('Sarah','Jones','1956-7-6','2016-4-7','2017-8-3','2018-10-2') \
            ,('John','Johnson','2017-10-12','2018-1-2','2018-10-3','2018-3-2') ]

df = spark.createDataFrame(sample,['first_name','last_name','dob','visit1','visit2','visit3'])
df.show()
df.printSchema()

+----------+---------+----------+--------+---------+---------+
|first_name|last_name|       dob|  visit1|   visit2|   visit3|
+----------+---------+----------+--------+---------+---------+
|  Santhosh|   Sahini|  1987-4-8|2016-1-7| 2017-2-3| 2018-3-2|
|   Sameera|   Appana|  1986-4-8|2015-1-7| 2017-1-3| 2018-1-2|
|     Ginny|   Ginger| 1986-7-10|2014-8-7| 2015-2-3| 2016-3-2|
|     Vijay|    Tracy|  1988-5-2|2016-1-7| 2018-2-3| 2018-3-2|
|       Jon|     Ravi| 1987-5-11|2016-5-7| 2017-1-3| 2018-9-2|
|     Sarah|    Jones|  1956-7-6|2016-4-7| 2017-8-3|2018-10-2|
|      John|  Johnson|2017-10-12|2018-1-2|2018-10-3| 2018-3-2|
+----------+---------+----------+--------+---------+---------+

root
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- visit1: string (nullable = true)
 |-- visit2: string (nullable = true)
 |-- visit3: string (nullable = true)



Dates are still stored as string, I will convert them again

In [15]:
# Coverting the date columns into date types
df = df.withColumn("dob", df["dob"].cast(DateType())) \
        .withColumn("visit1", df["visit1"].cast(DateType())) \
        .withColumn("visit2", df["visit2"].cast(DateType())) \
        .withColumn("visit3", df["visit3"].cast(DateType()))

# Check to make sure it worked
df.show()
df.printSchema()

+----------+---------+----------+----------+----------+----------+
|first_name|last_name|       dob|    visit1|    visit2|    visit3|
+----------+---------+----------+----------+----------+----------+
|  Santhosh|   Sahini|1987-04-08|2016-01-07|2017-02-03|2018-03-02|
|   Sameera|   Appana|1986-04-08|2015-01-07|2017-01-03|2018-01-02|
|     Ginny|   Ginger|1986-07-10|2014-08-07|2015-02-03|2016-03-02|
|     Vijay|    Tracy|1988-05-02|2016-01-07|2018-02-03|2018-03-02|
|       Jon|     Ravi|1987-05-11|2016-05-07|2017-01-03|2018-09-02|
|     Sarah|    Jones|1956-07-06|2016-04-07|2017-08-03|2018-10-02|
|      John|  Johnson|2017-10-12|2018-01-02|2018-10-03|2018-03-02|
+----------+---------+----------+----------+----------+----------+

root
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- visit1: date (nullable = true)
 |-- visit2: date (nullable = true)
 |-- visit3: date (nullable = true)



In [19]:
#Calculating a variable showing the length of time between patient visits
from pyspark.sql.functions import *
diff1 = df.select(datediff(df.visit2, df.visit1).alias('diff'))
diff2 = df.select(datediff(df.visit3, df.visit2).alias('diff'))

#Appending the two dataframes together
diff_combo = diff1.union(diff2)
diff_combo.show(5)

+----+
|diff|
+----+
| 393|
| 727|
| 180|
| 758|
| 241|
+----+
only showing top 5 rows



In [21]:
#Calculating the age of each patient
age = df.select(format_number(datediff(df.visit1, df.dob)/365,1).alias("age"))
age.show()

+----+
| age|
+----+
|28.8|
|28.8|
|28.1|
|27.7|
|29.0|
|59.8|
| 0.2|
+----+

