# Data prep before training⭐

This is an essential part of week 3. Run this notebook before training your ML model.



In [1]:
from pyspark.sql import SparkSession 


In [2]:
# Build (or reuse) the notebook's Spark session: local master, application name "Intro".
session_builder = SparkSession.builder.master('local[*]').appName("Intro")
spark = session_builder.getOrCreate()

## Exercise 1: Load the data:

In [3]:
# The first CSV row is used as the header (column names).
df = spark.read.option('header', True).csv('../datasets/bot_data.csv')

In [4]:
from pyspark.sql.types import IntegerType

# Replace followers_count with its integer-typed version.
followers_as_int = df["followers_count"].cast(IntegerType())
df = df.withColumn("followers_count", followers_as_int)

In [5]:
from pyspark.sql.types import IntegerType, BooleanType

# Cast the remaining count columns to integers ...
for int_col in ("friends_count", "listed_count", "favourites_count", "statuses_count"):
    df = df.withColumn(int_col, df[int_col].cast(IntegerType()))

# ... and the flag columns to booleans.
for bool_col in ("verified", "default_profile", "has_extended_profile", "default_profile_image"):
    df = df.withColumn(bool_col, df[bool_col].cast(BooleanType()))

In [6]:
from pyspark.sql.functions import when
# Take `id` when it is present; otherwise keep the existing `id_str` value.
merged_id = when(df['id'].isNotNull(), df['id']).otherwise(df['id_str'])
df = df.withColumn('id_str', merged_id)

In [7]:
# `id` has already been merged into `id_str`, so the separate column is removed.
df = df.drop('id')

In [8]:
# Columns that are not kept for the rest of the preparation.
unused_columns = ['default_profile_image', 'has_extended_profile', 'url', 'created_at', 'lang']
df = df.drop(*unused_columns)

In [9]:
# No column subset is given, so only rows identical in every column are removed.
df = df.dropDuplicates()


In [10]:
# Replace each of these fields with a presence flag: 0 when the value is null, 1 otherwise.
for text_col in ('location', 'status', 'screen_name', 'name'):
    is_present = when(df[text_col].isNull(), 0).otherwise(1)
    df = df.withColumn(text_col, is_present)

---

In [11]:
# Store the `bot` column as an integer.
bot_as_int = df['bot'].cast(IntegerType())
df = df.withColumn('bot', bot_as_int)


In [12]:
# Rows whose `bot` value is null get 0; every other row keeps its value.
bot_filled = when(df['bot'].isNull(), 0).otherwise(df['bot'])
df = df.withColumn('bot', bot_filled)


Do the same with the other boolean fields:
    Run the next commands:

In [13]:
# Step 1: store both flag columns as integers.
for flag_col in ('verified', 'default_profile'):
    df = df.withColumn(flag_col, df[flag_col].cast(IntegerType()))

# Step 2: null flags become 0, all other values are kept as they are.
for flag_col in ('verified', 'default_profile'):
    filled_flag = when(df[flag_col].isNull(), 0).otherwise(df[flag_col])
    df = df.withColumn(flag_col, filled_flag)


How many bots and non-bots do we have in the data?

Run the next command to find out!

In [14]:
# Remove the `id_str` column from the frame.
df = df.drop('id_str')



In [15]:
# Show the column names and types of the frame at this point.
df.printSchema()

root
 |-- screen_name: integer (nullable = false)
 |-- location: integer (nullable = false)
 |-- description: string (nullable = true)
 |-- followers_count: integer (nullable = true)
 |-- friends_count: integer (nullable = true)
 |-- listed_count: integer (nullable = true)
 |-- favourites_count: integer (nullable = true)
 |-- verified: integer (nullable = true)
 |-- statuses_count: integer (nullable = true)
 |-- status: integer (nullable = false)
 |-- default_profile: integer (nullable = true)
 |-- name: integer (nullable = false)
 |-- bot: integer (nullable = true)



In [16]:
# To avoid errors later on, discard every row whose description is null/None or N/A.
df = df.na.drop(subset=['description'])

In [17]:
# Run the next command; we will need its output in chapter 4.
# Keeps only the description and the bot column (renamed to label), then saves it as Parquet.
df_desc = df.select(df['description'], df['bot'].alias('label'))
df_desc.write.parquet("../datasets/train_data_only_description", mode='overwrite')


---

In [18]:
# Check out this Python code and run it. What did you get?
# Will this work for the task?
# How would you combine it with a UDF?

def split_and_set(some_str):
    """Return the distinct space-separated tokens of a string.

    The characters ``[](){}<>,'/.`` are deleted before splitting on single
    spaces. The order of the returned list is not defined. Any value that is
    not a ``str`` is handed back unchanged.
    """
    if not isinstance(some_str, str):
        return some_str
    stripped = ''.join(filter(lambda ch: ch not in "[](){}<>,'/.", some_str))
    unique_tokens = set(stripped.split(' '))
    return list(unique_tokens)

# Sample input: the comma is deleted, not replaced by a space, so 'lol,a' ends up as one token.
some_str = '[csds b lol,a]'

# The inner call returns a list; the outer call receives a non-str value and returns it unchanged.
split_and_set(split_and_set(some_str))

['lola', 'csds', 'b']

In [19]:
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import udf

def split_and_set(some_str):
    """Tokenize a text into its distinct words.

    The characters ``[](){}<>,'/.`` are deleted (not replaced by a space, so
    ``'lol,a'`` becomes the single token ``'lola'``). The remaining text is
    then split on any run of whitespace.

    Args:
        some_str: the text to tokenize. Any value that is not a ``str``
            (for example ``None``) is returned unchanged.

    Returns:
        A sorted list of the unique, non-empty tokens for ``str`` input,
        otherwise the input itself.
    """
    if not isinstance(some_str, str):
        return some_str
    cleaned = ''.join(c for c in some_str if c not in "[](){}<>,'/.")
    # split() without an argument splits on any whitespace run, so tabs and
    # newlines separate words and repeated spaces never yield '' tokens.
    # sorted() gives a stable order; plain set iteration order is arbitrary.
    return sorted(set(cleaned.split()))

# Wrap the tokenizer as a Spark UDF that returns an array of strings,
# then replace the description column with its token list.
token_array_type = ArrayType(StringType())
list_udf = udf(split_and_set, token_array_type)

description_tokens = list_udf(df['description'])
df = df.withColumn('description', description_tokens)

In [20]:
# Show the schema again after the description column was replaced by the UDF result.
df.printSchema()

root
 |-- screen_name: integer (nullable = false)
 |-- location: integer (nullable = false)
 |-- description: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- followers_count: integer (nullable = true)
 |-- friends_count: integer (nullable = true)
 |-- listed_count: integer (nullable = true)
 |-- favourites_count: integer (nullable = true)
 |-- verified: integer (nullable = true)
 |-- statuses_count: integer (nullable = true)
 |-- status: integer (nullable = false)
 |-- default_profile: integer (nullable = true)
 |-- name: integer (nullable = false)
 |-- bot: integer (nullable = true)



In [21]:
# Happy with the results? Write them to a file!
# Run this command: it saves the frame as Parquet in 'overwrite' mode.
df.write.mode('overwrite').parquet("../datasets/final_train_data")

In [22]:
# Split the prepared frame into roughly 70% training and 30% test rows.
# A fixed seed is passed so the sampling does not use a fresh random seed on
# every run; without it the train/test membership changes each time.
SPLIT_SEED = 42
(training_data, test_data) = df.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [23]:
# Save the test split as Parquet, overwriting any existing output at this path.
test_data.write.parquet("../datasets/classified_test_data", mode='overwrite')

In [24]:
# Save the training split as Parquet, overwriting any existing output at this path.
training_data.write.parquet("../datasets/classified_train_data", mode='overwrite')