# YouTube Analysis, Trying to Predict Views: Ryan, Hannah, and Spencer

In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("comm")
    .getOrCreate()
)

In [2]:
# Relative path to the CSV file that is loaded into Spark in the next cell.
data_file = 'YouTube_Final_Data.csv'

In [3]:
# Load the CSV: the first row supplies the column names and Spark is asked
# to infer each column's type.
csv_reader = spark.read.options(header='true', inferSchema='true')
YouTube_df = csv_reader.csv(data_file)

In [4]:
# Number of rows read from the CSV, before any cleaning.
YouTube_df.count()

132802

In [5]:
# Inspect the inferred schema. Note that every column, including the count
# columns (views, likes, dislikes, comment_count), comes back as string.
YouTube_df.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



In [6]:
# preview the first 2 records
YouTube_df.show(2)

+-----------+-------------+--------------------+---------------+-----------+--------------------+--------------------+-------+-----+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               title|  channel_title|category_id|        publish_time|                tags|  views|likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-----------+-------------+--------------------+---------------+-----------+--------------------+--------------------+-------+-----+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|2kyS6SvSYSE|     17.14.11|WE WANT TO TALK A...|   CaseyNeistat|         22|2017-11-13T17:13:...|     SHANtell martin| 748374|57527|    2966|        15954|https://i.ytimg.c...|            False|           False|                 Fals

In [7]:
# Sanity check: confirm that YouTube_df is a Spark DataFrame.
type(YouTube_df)

pyspark.sql.dataframe.DataFrame

### Data Cleaning: Time to dig around a little bit and see what we're working with

In [8]:
from pyspark.sql import functions as F

# Count videos per distinct comments_disabled value to see what proportion
# of videos have comments disabled (and which values are malformed).
(
    YouTube_df
    .groupBy("comments_disabled")
    .agg(F.count("video_id"))
    .show()
)

+--------------------+---------------+
|   comments_disabled|count(video_id)|
+--------------------+---------------+
|               FALSE|          78531|
|               False|          40316|
|                null|          11890|
|                TRUE|           1266|
|    sports and more.|            117|
|             Wiz Kid|              4|
| sports and more....|              1|
|                True|            633|
|           Fida Daar|              3|
|            farfalle|             41|
+--------------------+---------------+



In [18]:
# "FALSE"/"False" and "TRUE"/"True" are distinct strings, so fold the
# upper-case spellings into the title-case ones. Every other value
# (including the wonky ones) is passed through unchanged for now.
from pyspark.sql.functions import when

raw_flag = YouTube_df.comments_disabled
normalized_flag = (
    when(raw_flag == "FALSE", "False")
    .when(raw_flag == "TRUE", "True")
    .otherwise(raw_flag)
)
YouTube_df = YouTube_df.withColumn("comments_disabled", normalized_flag)

# Re-run the per-value counts to check the result of the normalization.
(
    YouTube_df
    .groupBy("comments_disabled")
    .agg(F.count("video_id"))
    .show()
)

+--------------------+---------------+
|   comments_disabled|count(video_id)|
+--------------------+---------------+
|               False|         118847|
|                null|          11890|
|    sports and more.|            117|
|             Wiz Kid|              4|
| sports and more....|              1|
|                True|           1899|
|           Fida Daar|              3|
|            farfalle|             41|
+--------------------+---------------+



In [19]:
# With the flag normalized to 'True'/'False', split out the rows carrying
# each of the two valid values; rows with any other value match neither.
YouTube_df2 = YouTube_df.where(YouTube_df["comments_disabled"] == 'True')
YouTube_df3 = YouTube_df.where(YouTube_df["comments_disabled"] == 'False')

In [20]:
# Stack the 'True' rows and the 'False' rows back into a single frame; rows
# whose comments_disabled held any other value are not part of either input.
YouTube_final = YouTube_df2.union(YouTube_df3)

In [21]:
# Verify that only the two valid comments_disabled values remain.
(
    YouTube_final
    .groupBy("comments_disabled")
    .agg(F.count("video_id"))
    .show()
)

+-----------------+---------------+
|comments_disabled|count(video_id)|
+-----------------+---------------+
|            False|         118847|
|             True|           1899|
+-----------------+---------------+



In [None]:
# Same inspection for ratings_disabled: count videos per distinct value.
(
    YouTube_final
    .groupBy("ratings_disabled")
    .agg(F.count("video_id"))
    .show()
)

In [None]:
# List the column names available on the cleaned frame.
YouTube_final.columns

In [None]:
# Summary statistics for the four count columns.
count_columns = ["views", "likes", "dislikes", "comment_count"]
youtube_summary = YouTube_final.describe(*count_columns)
youtube_summary.show()

In [None]:
# Count the null/NaN entries in every column to see where values are missing.
# There appear to be a lot of them, but that doesn't matter a ton to us
# because we're going to focus on the description first.
#
# Code adapted from:
# https://sparkbyexamples.com/pyspark/pyspark-find-count-of-null-none-nan-values/
from pyspark.sql.functions import col, isnan, when, count

missing_counts = [
    count(when(col(name).isNull() | isnan(name), name)).alias(name)
    for name in YouTube_final.columns
]
YouTube_final.select(missing_counts).show()

In [None]:
# Build a numeric-only frame so we can compute a correlation matrix.
#
# The schema printed earlier shows views/likes/dislikes/comment_count were all
# read as strings, so each one is cast to int here, always from its OWN
# source column. (comment_count used to be cast from "views", which silently
# duplicated the views column and forced a perfect views/comment_count
# correlation.)
#
# NOTE(review): values that cannot be parsed as int are expected to become
# null under Spark's default (non-ANSI) cast behaviour; confirm, and handle
# those rows before feeding this frame to anything that rejects nulls.
numeric_columns = ["views", "likes", "dislikes", "comment_count"]
Numeric_only = YouTube_final.select(
    [col(name).cast("int").alias(name) for name in numeric_columns]
)
Numeric_only

In [None]:
# Pearson correlation matrix for our numeric features.
# Adapted from:
# https://stackoverflow.com/questions/52214404/how-to-get-the-correlation-matrix-of-a-pyspark-data-frame
# Authors' observation: the numeric features are pretty heavily correlated.
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler

vector_col = "corr_features"

# VectorAssembler raises on null inputs by default (handleInvalid="error"),
# and the string -> int casts that produced Numeric_only can leave nulls
# behind, so drop incomplete rows before assembling.
numeric_complete = Numeric_only.dropna()

# Correlation.corr works on a single vector column, so pack the numeric
# columns into one.
assembler = VectorAssembler(inputCols=numeric_complete.columns, outputCol=vector_col)
df_vector = assembler.transform(numeric_complete).select(vector_col)

# The result is a one-row frame whose only cell holds the correlation matrix;
# .values exposes that matrix as a flat array of its entries.
matrix = Correlation.corr(df_vector, vector_col)
matrix.head()["pearson({})".format(vector_col)].values