<p style="text-align:center">
        <img src="https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/assets/logos/SN_web_lightmode.png" width="300" alt="Skills Network Logo">
</p>


### Analyse search terms on the e-commerce web server


##### In this assignment you will download the search term data set for the e-commerce web server and run analytic queries on it.


In [1]:
import atexit
from pathlib import Path

In [2]:
# Install spark
!pip install pyspark
!pip install findspark



In [45]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col

from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml.feature import VectorAssembler

In [12]:
# Start session

# Creating a spark context class
sc = SparkContext()

# Creating a spark session
spark = SparkSession \
    .builder \
    .appName("IBM_DEng_Capstone_Project") \
    .getOrCreate()

atexit.register(lambda : sc.stop())
atexit.register(lambda : spark.stop());

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[*]) created by __init__ at /tmp/ipykernel_1799/2151308266.py:4 

In [5]:
# Download The search term dataset from the below url
!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv

--2024-06-26 14:53:12--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104, 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 233457 (228K) [text/csv]
Saving to: ‘searchterms.csv.2’


2024-06-26 14:53:12 (57.0 MB/s) - ‘searchterms.csv.2’ saved [233457/233457]



In [6]:
searchterms_path = Path("/resources") / "labs" / "DB0321EN" / "searchterms.csv"
assert searchterms_path.exists()

In [7]:
!head -n1 {str(searchterms_path)}

﻿day,month,year,searchterm


In [11]:
# Load the csv into a spark dataframe

# Define schema
schema = StructType([
    StructField("day", IntegerType(), True),
    StructField("month", IntegerType(), True),
    StructField("year", IntegerType(), True),
    StructField("searchterm", StringType(), True)
])

# Read CSV file with schema
df_searchterms = spark.read.csv(str(searchterms_path), schema=schema, header=True)

df_searchterms.persist()
atexit.register(lambda : df_searchterms.unpersist());

24/06/26 14:53:56 WARN execution.CacheManager: Asked to cache already cached data.


In [15]:
# Print the number of rows and columns
# Take a screenshot of the code and name it as shape.jpg)

n_rows = df_searchterms.count()
n_cols = len(df_searchterms.columns)

print(f"Number of Rows:{n_rows}\nNumber of Columns: {n_cols}")

Number of Rows:10000
Number of Columns: 4


In [17]:
# Print the top 5 rows
# Take a screenshot of the code and name it as top5rows.jpg)
df_searchterms.show(5)

+---+-----+----+--------------+
|day|month|year|    searchterm|
+---+-----+----+--------------+
| 12|   11|2021| mobile 6 inch|
| 12|   11|2021| mobile latest|
| 12|   11|2021|   tablet wifi|
| 12|   11|2021|laptop 14 inch|
| 12|   11|2021|     mobile 5g|
+---+-----+----+--------------+
only showing top 5 rows



In [19]:
# Find out the datatype of the column searchterm?
# Take a screenshot of the code and name it as datatype.jpg)

df_searchterms.select("searchterm").printSchema()

root
 |-- searchterm: string (nullable = true)



In [24]:
# How many times was the term `gaming laptop` searched?
# Take a screenshot of the code and name it as gaminglaptop.jpg)

filtered_df = df_searchterms.filter(col("searchterm").contains("gaming laptop"))

count = filtered_df.count()
print("Result:", count)

Result: 499


In [33]:
# Print the top 5 most frequently used search terms?
# Take a screenshot of the code and name it as top5terms.jpg)

top_search_terms = df_searchterms.groupBy('searchterm').count().orderBy('count', ascending=False).select('searchterm', 'count')
top_search_terms.show(5)



+-------------+-----+
|   searchterm|count|
+-------------+-----+
|mobile 6 inch| 2312|
|    mobile 5g| 2301|
|mobile latest| 1327|
|       laptop|  935|
|  tablet wifi|  896|
+-------------+-----+
only showing top 5 rows



                                                                                

In [39]:
# The pretrained sales forecasting model is available at  the below url
# https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz
!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz
! tar -xzvf model.tar.gz
! rm *.tar.gz*

--2024-06-26 15:14:27--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104, 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1490 (1.5K) [application/x-tar]
Saving to: ‘model.tar.gz’


2024-06-26 15:14:27 (13.0 MB/s) - ‘model.tar.gz’ saved [1490/1490]

sales_prediction.model/
sales_prediction.model/metadata/
sales_prediction.model/metadata/part-00000
sales_prediction.model/metadata/.part-00000.crc
sales_prediction.model/metadata/_SUCCESS
sales_prediction.model/metadata/._SUCCESS.crc
sales_prediction.model/data/
sales_prediction.model/data/part-00000-1db9fe2f-4d93-4b1f-966b-3b09

In [43]:
# Load the sales forecast model.
# Take a screenshot of the code and name it as loadmodel.jpg)

predictor_model = LinearRegressionModel.load("sales_prediction.model")

In [51]:
# Using the sales forecast model, predict the sales for the year of 2023.
# Take a screenshot of the code and name it as forecast.jpg

def predict(year: int):
    data = [(year,)]
    columns = ["year"]
    df = spark.createDataFrame(data, columns)

    # Assemble the features
    assembler = VectorAssembler(inputCols=["year"], outputCol="features")
    transformed_df = assembler.transform(df).select("features", "year")

    predictions = predictor_model.transform(transformed_df)
    predictions.select("prediction").show()

predict(2023)


                                                                                

+------------------+
|        prediction|
+------------------+
|175.16564294006457|
+------------------+



24/06/26 15:27:01 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/06/26 15:27:01 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
