In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=1347ab2602a72e9c20adf5a7cfa533b5bd91432e522e4be1dccac63c096a93cd
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


Initialize the Spark session, import libraries, and load the data file

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import NaiveBayes, LogisticRegression, LinearSVC
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Start (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("ModelSelection")
    .getOrCreate()
)

# Read the CSV, taking column names from its first line. No schema is
# inferred, so every column is loaded as a string.
data = spark.read.option("header", True).csv("/content/daily_dose_of_internet.csv")


In [None]:
# Print the column names and types of the loaded DataFrame.
data.printSchema()

root
 |-- Video URL: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Views: string (nullable = true)
 |-- Likes: string (nullable = true)
 |-- Publish Date: string (nullable = true)
 |-- Comments Count: string (nullable = true)
 |-- User Name: string (nullable = true)
 |-- Date And Time: string (nullable = true)
 |-- Comment: string (nullable = true)
 |-- Thumbs Up: string (nullable = true)



Actions:

In [None]:
# Preview the first five rows.
data.show(5)

+--------------------+--------------------+-------+------+-------------------+--------------+--------------------+-------------------+--------------------+---------+
|           Video URL|               Title|  Views| Likes|       Publish Date|Comments Count|           User Name|      Date And Time|             Comment|Thumbs Up|
+--------------------+--------------------+-------+------+-------------------+--------------+--------------------+-------------------+--------------------+---------+
|https://www.youtu...|Baby Discovers Gl...|5819784|252147|2023-07-24 23:32:45|         11046|@-universalbeast-...|2023-07-24 23:33:01|                   1|     21.0|
|https://www.youtu...|Baby Discovers Gl...|5819784|252147|2023-07-24 23:32:45|         11046|           @yeet__gt|2023-07-24 23:33:05|                   W|     NULL|
|https://www.youtu...|Baby Discovers Gl...|5819784|252147|2023-07-24 23:32:45|         11046|            @athumen|2023-07-24 23:33:06|Guess I got here ...|     NULL|
|htt

In [None]:
# Count the rows of `data`. Despite its name, `rdd2` holds a plain integer
# row count, not an RDD.
rdd2 = data.count()
print(f"Number of rows in data : {rdd2}")

Number of rows in data : 61850


In [None]:
# Fetch the first row of `data`. Despite its name, `rdd3` is a single Row
# object, not an RDD.
# NOTE(review): the saved output of this cell includes a Title_Length field,
# which is only added in the "Data Preprocessing" section further down, so
# the cell was last executed out of order. Re-run the notebook top to bottom.
rdd3 =data.first()
print(f"The first element of dataset: {rdd3}")

The first element of dataset: Row(Video URL='https://www.youtube.com/watch?v=-K8Q_2zQMmc', Title='Baby Discovers Glitch in Reality', Views='5819784', Likes='252147', Publish Date='2023-07-24 23:32:45', Comments Count='11046', User Name='@-universalbeast-9558', Date And Time='2023-07-24 23:33:01', Comment='1', Thumbs Up='21.0', Title_Length=32)


In [None]:
# Discard every row that contains at least one null value.
# NOTE(review): `preprocessed_data` is not referenced by any of the cells
# visible after this one; they keep working from `data` instead.
preprocessed_data = data.dropna()


Transformations:

In [None]:
# Keep only the rows with a positive like count. `Likes` is loaded as a string
# column, so cast it explicitly rather than relying on Spark's implicit
# string-to-number coercion inside the comparison.
filtered_data = data.filter(data["Likes"].cast("double") > 0)
filtered_data

DataFrame[Video URL: string, Title: string, Views: string, Likes: string, Publish Date: string, Comments Count: string, User Name: string, Date And Time: string, Comment: string, Thumbs Up: string, Title_Length: int]

In [None]:
# Group the rows by their "Views" value. No aggregation is applied, so the
# result is a GroupedData object rather than a DataFrame.
# NOTE(review): the variable name refers to delays and origins, which match no
# column in this dataset; it looks carried over from another example. Consider
# renaming it.
avg_delay_by_origin = data.groupBy("Views")
avg_delay_by_origin

GroupedData[grouping expressions: [Views], value: [Video URL: string, Title: string ... 9 more fields], type: GroupBy]

In [None]:
# Order the rows by video title (ascending); this is lazy, nothing is computed yet.
sorted_data = data.sort("Title")
sorted_data

DataFrame[Video URL: string, Title: string, Views: string, Likes: string, Publish Date: string, Comments Count: string, User Name: string, Date And Time: string, Comment: string, Thumbs Up: string, Title_Length: int]

Data Preprocessing

In [None]:
from pyspark.sql.functions import length

# Add a Title_Length column holding the length of each row's title.
# Re-running the cell simply overwrites the same column.
data = data.withColumn(colName="Title_Length", col=length(data["Title"]))
print(data)

DataFrame[Video URL: string, Title: string, Views: string, Likes: string, Publish Date: string, Comments Count: string, User Name: string, Date And Time: string, Comment: string, Thumbs Up: string, Title_Length: int]


In [None]:
from pyspark.sql.functions import when

# Replace missing values in the 'Likes' column with a default of zero.
# `Likes` is a string column at this point, so the default is given as the
# string "0" instead of mixing an integer literal into a string column, and
# the built-in null-filling API replaces the hand-written when/otherwise.
data = data.fillna({"Likes": "0"})


In [None]:
# Narrow the DataFrame to the four columns the later cells work with.
new_data = data.select(["Title", "Views", "Likes", "Comments Count"])

In [None]:
# Preview the selected columns.
new_data.show()

+--------------------+-------+------+--------------+
|               Title|  Views| Likes|Comments Count|
+--------------------+-------+------+--------------+
|Baby Discovers Gl...|5819784|252147|         11046|
|Baby Discovers Gl...|5819784|252147|         11046|
|Baby Discovers Gl...|5819784|252147|         11046|
|Baby Discovers Gl...|5819784|252147|         11046|
|Baby Discovers Gl...|5819784|252147|         11046|
|Baby Discovers Gl...|5819784|252147|         11046|
|Baby Discovers Gl...|5819784|252147|         11046|
|Baby Discovers Gl...|5819784|252147|         11046|
|Baby Discovers Gl...|5819784|252147|         11046|
|Baby Discovers Gl...|5819784|252147|         11046|
|Baby Discovers Gl...|5819784|252147|         11046|
|Baby Discovers Gl...|5819784|252147|         11046|
|Baby Discovers Gl...|5819784|252147|         11046|
|Baby Discovers Gl...|5819784|252147|         11046|
|Baby Discovers Gl...|5819784|252147|         11046|
|Baby Discovers Gl...|5819784|252147|         

In [None]:
# List the distinct video titles present in the data.
new_data.select("Title").distinct().show()

+--------------------+
|               Title|
+--------------------+
|Security Camera C...|
|Didn’t Know He Wa...|
| This Is An Air Fish|
|Baby Discovers Gl...|
|Man Saves Frozen ...|
|Kid Gets Caught T...|
|The Best Of The I...|
+--------------------+



In [None]:
# Print the schema of the selected subset.
new_data.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Views: string (nullable = true)
 |-- Likes: string (nullable = true)
 |-- Comments Count: string (nullable = true)



In [None]:
# Column names of the selected subset.
new_data.columns

['Title', 'Views', 'Likes', 'Comments Count']

In [None]:
from pyspark.sql.functions import count, when, isnan, isnull, col

# Count, per column, the values that are null or NaN.
missing_values = new_data.select(
    [count(when(isnan(c) | isnull(c), c)).alias(c) for c in new_data.columns]
)
missing_values.show()

# Check the variability of the target variable ("Likes").
# `Likes` is still a string column here, so cast it to a number first;
# otherwise min/max are compared as text (e.g. "1130274" sorts before "522738")
# and the reported minimum ends up larger than the reported maximum.
target_summary = new_data.select(col("Likes").cast("double").alias("Likes")).summary()
target_summary.show()


+-----+-----+-----+--------------+
|Title|Views|Likes|Comments Count|
+-----+-----+-----+--------------+
|    0|    0|    0|             0|
+-----+-----+-----+--------------+

+-------+-----------------+
|summary|            Likes|
+-------+-----------------+
|  count|            61850|
|   mean|385283.5668391269|
| stddev|192251.3272354196|
|    min|          1130274|
|    25%|         252147.0|
|    50%|         348031.0|
|    75%|         359113.0|
|    max|           522738|
+-------+-----------------+



In [None]:
from pyspark.sql.functions import col

# Convert the numeric columns from string to double precision.
# A 32-bit "float" cannot represent every integer above 2**24 (~16.7M) exactly,
# and the view counts in this dataset go well beyond that, so "double" is used
# to keep the counts exact. Each withColumn overwrites the column in place, so
# re-running the cell is harmless.
new_data = (
    new_data
    .withColumn("Views", col("Views").cast("double"))
    .withColumn("Comments Count", col("Comments Count").cast("double"))
    .withColumn("Likes", col("Likes").cast("double"))
)

In [None]:
from pyspark.ml.feature import StringIndexer

# Encode each distinct title as a numeric index stored in a new `Title1` column.
index = StringIndexer().setInputCol("Title").setOutputCol("Title1")
new_data1 = index.fit(new_data).transform(new_data)


In [None]:
# Show the distinct rows of the indexed DataFrame.
new_data1.distinct().show()

+--------------------+-----------+---------+--------------+------+
|               Title|      Views|    Likes|Comments Count|Title1|
+--------------------+-----------+---------+--------------+------+
|Baby Discovers Gl...|  5819784.0| 252147.0|       11046.0|   4.0|
| This Is An Air Fish|  9203380.0| 192591.0|       12344.0|   5.0|
|Man Saves Frozen ...|1.3031411E7| 359113.0|       12280.0|   3.0|
|Security Camera C...|1.1582199E7| 348031.0|       18518.0|   0.0|
|Didn’t Know He Wa...|  6949914.0| 326700.0|       10333.0|   2.0|
|The Best Of The I...|4.9366644E7|1130274.0|       34986.0|   6.0|
|Kid Gets Caught T...|2.2681858E7| 522738.0|       17398.0|   1.0|
+--------------------+-----------+---------+--------------+------+



In [None]:
# List the distinct index values produced for the titles.
new_data1.select("Title1").distinct().show()

+------+
|Title1|
+------+
|   0.0|
|   4.0|
|   3.0|
|   2.0|
|   5.0|
|   1.0|
|   6.0|
+------+



In [None]:
# Column names after adding the title index.
new_data1.columns

['Title', 'Views', 'Likes', 'Comments Count', 'Title1']

In [None]:

# Columns that are combined into the model input vector.
feature_columns = ["Views", "Comments Count", "Title1"]

# Pack the feature columns into a single vector column named "feature".
assembler = VectorAssembler().setInputCols(feature_columns).setOutputCol("feature")
df = assembler.transform(new_data1)



Model Building

In [None]:
# Linear regression

# 80/20 train/test split with a fixed seed.
train_data, test_data = df.randomSplit(weights=[0.8, 0.2], seed=42)

# Configure a LinearRegression model (default hyperparameters) and fit it on
# the training split.
lr = LinearRegression().setFeaturesCol("feature").setLabelCol("Likes")
lr_model = lr.fit(train_data)

# Predict on the held-out split.
predictions = lr_model.transform(test_data)

# R2 evaluator comparing the "prediction" column with the "Likes" label.
evaluator = (
    RegressionEvaluator()
    .setLabelCol("Likes")
    .setPredictionCol("prediction")
    .setMetricName("r2")
)

# R2 on the test split.
r2_score = evaluator.evaluate(predictions)
print("R2 Score:", r2_score)


R2 Score: 0.932172062019895


Inference:

With an R2 score of approximately 0.93, the Linear Regression model explains about 93.2% of the variability in the number of "Likes" on the test split based on the provided features. This high R2 score indicates a strong relationship between the features and the target variable.

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor

# Decision tree regressor predicting "Likes" from the assembled feature vector.
dt = DecisionTreeRegressor().setLabelCol("Likes").setFeaturesCol("feature")

# Fit on the training split, then predict on the held-out split.
dt_model = dt.fit(train_data)
dt_pred = dt_model.transform(test_data)

# R2 evaluator for this model's predictions.
evaluator2 = (
    RegressionEvaluator()
    .setLabelCol("Likes")
    .setPredictionCol("prediction")
    .setMetricName("r2")
)

# R2 on the test split.
r2_score = evaluator2.evaluate(dt_pred)
print("R2 Score:", r2_score)

R2 Score: 1.0


Inference:
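The Decision Tree Regressor achieves an R2 score of 1.0, meaning it explains 100% of the variance in "Likes" on the held-out test split (the score above is computed on `test_data`, not on the training data). This should not be read as genuine predictive skill: the dataset contains only seven distinct combinations of Title, Views, Comments Count, and Likes, repeated across all rows, so every test row duplicates rows seen during training and the tree can simply memorize them.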

In [None]:
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor

# Random Forest Regression
# The seed is pinned (same value as the train/test split) so that the forest's
# random sampling does not vary from one run of the notebook to the next.
rf = RandomForestRegressor(labelCol="Likes", featuresCol="feature", seed=42)
rf_model = rf.fit(train_data)
rf_pred = rf_model.transform(test_data)

# R2 evaluator for this cell. It is the evaluator actually used below, so the
# cell no longer depends on an `evaluator` object created by a different cell.
evaluator2 = RegressionEvaluator(labelCol="Likes", predictionCol="prediction", metricName="r2")

# R2 on the test split.
rf_r2_score = evaluator2.evaluate(rf_pred)
print("Random Forest R2 Score:", rf_r2_score)



Random Forest R2 Score: 0.999937678917951


Inference:
With an R2 score of approximately 0.999937, the RandomForestRegressor model explains an exceptionally high proportion of the variance in the number of "Likes" based on the provided features.


In [None]:
# Gradient-boosted tree regression, limited to 10 boosting iterations.
gbt = GBTRegressor(maxIter=10).setLabelCol("Likes").setFeaturesCol("feature")

# Fit on the training split, then predict on the held-out split.
gbt_model = gbt.fit(train_data)
gbt_pred = gbt_model.transform(test_data)

# R2 on the test split, using the `evaluator` created in the
# linear-regression cell.
gbt_r2_score = evaluator.evaluate(gbt_pred)
print("Gradient Boosted Tree R2 Score:", gbt_r2_score)

Gradient Boosted Tree R2 Score: 1.0


Inference:

Although an R2 score of 1.0 signifies an excellent fit between the model and the data, it is essential to interpret the result cautiously, considering factors such as overfitting and model complexity. Nonetheless, the GBT Regressor model's ability to perfectly explain the variability in "Likes" suggests it is a powerful tool for predicting engagement metrics in this context.


In [None]:
from pyspark.ml.regression import GeneralizedLinearRegression

# Gaussian family with identity link: a linear regression fitted through the
# generalized-linear-model API, with a small regularization penalty.
# NOTE: GeneralizedLinearRegression's regParam is an L2 (ridge-style) penalty,
# so this is not a Lasso (L1) model. The `lasso*` variable names are kept only
# so that anything referring to them keeps working; the printed label is
# corrected to describe the model that is actually fitted.
lasso = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.1, labelCol="Likes", featuresCol="feature")

# Train the model
lasso_model = lasso.fit(train_data)

# Make predictions on the held-out split
lasso_pred = lasso_model.transform(test_data)

# Evaluate the model with R2 on the test split
lasso_evaluator = RegressionEvaluator(labelCol="Likes", predictionCol="prediction", metricName="r2")
lasso_r2_score = lasso_evaluator.evaluate(lasso_pred)
print("Regularized GLM (L2) R2 Score:", lasso_r2_score)



Lasso Regression R2 Score: 0.9321720599918815


Inference:

With an R2 score of approximately 0.932, the Generalized Linear Regression model explains about 93.2% of the variability in the number of "Likes" based on the provided features. This high R2 score indicates a strong relationship between the features and the target variable, similar to the Linear Regression model.