In [74]:
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [75]:
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.2.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [1 InRelease 14.2 kB/88.70% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Waiting f                                                                               Hit:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Ign:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:5 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [696 B]
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:9 https://developer.download.nvidia.com/compute/

In [76]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("sparkFunctions").getOrCreate()

In [77]:
from pyspark import SparkFiles
url ="https://yelpproject4.s3.amazonaws.com/yelp_restaurants.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("yelp_restaurants.csv"), sep=",", header=True)

# Show DataFrame
df.show()


+--------------------+--------------------+--------------------+-----------+-----+-----------+-------------+---------------+-----+------------+-------+--------------------+--------------------+--------------------+
|         business_id|                name|             address|       city|state|postal_code|     latitude|      longitude|stars|review_count|is_open|          attributes|          categories|               hours|
+--------------------+--------------------+--------------------+-----------+-----+-----------+-------------+---------------+-----+------------+-------+--------------------+--------------------+--------------------+
|6iYb2HFDywm3zjuRg...| Oskar Blues Taproom|        921 Pearl St|    Boulder|   CO|      80302|   40.0175444|   -105.2833481|  4.0|          86|      1|"{'RestaurantsTab...| 'BikeParking': '...| 'BusinessParking...|
|tCbdrRPZA0oiIYSmH...|Flying Elephants ...| 7000 NE Airport Way|   Portland|   OR|      97218|45.5889058992|-122.5933307507|  4.0|         1

In [79]:
df1= df.select(["review_count","categories", "state", "stars"])

In [93]:
yelp = df1.toPandas()
yelp

Unnamed: 0,review_count,categories,state,stars
0,86,'BikeParking': 'True',CO,4.0
1,126,'GoodForKids': 'True',OR,4.0
2,169,'RestaurantsGoodForGroups': 'True',BC,3.5
3,11,"Breakfast & Brunch, Restaurants",OH,4.5
4,39,'intimate': False,MA,4.0
...,...,...,...,...
32017,733,"'WiFi': """"u'no'""""",MA,5.0
32018,437,"'RestaurantsAttire': """"'casual'""""",FL,4.5
32019,310,'street': False,TX,3.0
32020,185,'BusinessAcceptsCreditCards': 'True',GA,3.0


In [94]:
yelp['stars']= yelp['stars'].astype(float)

In [95]:
def ycolumn(x):
  stars = x['stars']
  if stars >= 3:
    return 1
  else:
    return 0
yelp['Stars Y'] = yelp.apply(lambda x: ycolumn(x),axis=1)

In [96]:
yelp['Stars Y']

0        1
1        1
2        1
3        1
4        1
        ..
32017    1
32018    1
32019    1
32020    1
32021    1
Name: Stars Y, Length: 32022, dtype: int64

In [97]:
from pyspark.sql.types import StructField, StringType, IntegerType, StructType
schema = [StructField("stars", IntegerType(), True), StructField("review_count", IntegerType(), True),StructField("categories", StringType(), True),StructField("state", StringType(), True) ]
schema

[StructField(stars,IntegerType,true),
 StructField(review_count,IntegerType,true),
 StructField(categories,StringType,true),
 StructField(state,StringType,true)]

In [126]:
X= yelp.drop(["stars", "Stars Y"], axis =1)

X

Unnamed: 0,review_count,categories,state
0,86,'BikeParking': 'True',CO
1,126,'GoodForKids': 'True',OR
2,169,'RestaurantsGoodForGroups': 'True',BC
3,11,"Breakfast & Brunch, Restaurants",OH
4,39,'intimate': False,MA
...,...,...,...
32017,733,"'WiFi': """"u'no'""""",MA
32018,437,"'RestaurantsAttire': """"'casual'""""",FL
32019,310,'street': False,TX
32020,185,'BusinessAcceptsCreditCards': 'True',GA


In [127]:
X_dummies = pd.get_dummies(X)
X_dummies

Unnamed: 0,review_count_10,review_count_100,review_count_1001,review_count_1005,review_count_1009,review_count_101,review_count_1016,review_count_1017,review_count_102,review_count_1021,review_count_1023,review_count_1026,review_count_1028,review_count_103,review_count_1031,review_count_1032,review_count_1033,review_count_1035,review_count_1037,review_count_1038,review_count_1039,review_count_104,review_count_1040,review_count_1044,review_count_1047,review_count_1048,review_count_1049,review_count_105,review_count_1050,review_count_1051,review_count_1054,review_count_1055,review_count_1056,review_count_1057,review_count_1058,review_count_106,review_count_1064,review_count_1065,review_count_1066,review_count_1067,...,"categories_Vegetarian, Juice Bars & Smoothies, Vegan, Restaurants, Food, Latin American","categories_Vegetarian, Restaurants","categories_Vegetarian, Restaurants, Salad, Ice Cream & Frozen Yogurt, Food, Juice Bars & Smoothies, Fast Food","categories_Vegetarian, Vegan, Restaurants","categories_Vegetarian, Vegan, Salad, Restaurants","categories_Venues & Event Spaces, Chinese, Wedding Planning, Restaurants, Event Planning & Services","categories_Venues & Event Spaces, Cinema, Event Planning & Services, Arts & Entertainment, Restaurants, American (New)","categories_Venues & Event Spaces, Nightlife, Comfort Food, Hotels, Event Planning & Services, Restaurants, Bars, Hotels & Travel","categories_Venues & Event Spaces, Social Clubs, Arts & Entertainment, Restaurants, Event Planning & Services","categories_Vietnamese, Restaurants","categories_Vietnamese, Restaurants, Gluten-Free, Vegetarian","categories_Vietnamese, Soup, Restaurants, Noodles","categories_Waffles, Food, Bubble Tea, Hot Pot, Restaurants","categories_Waffles, Food, Cafes, Themed Cafes, Coffee & Tea, Restaurants","categories_Waffles, Food, Restaurants, Creperies, Acai Bowls","categories_Waffles, Taiwanese, Restaurants","categories_Wedding Planning, Event Planning & Services, Education, Restaurants, Social Clubs, Arts & Entertainment, American (New), Colleges & Universities, Venues & Event Spaces","categories_Wedding Planning, Event Planning & Services, Venues & Event Spaces, Caterers, Restaurants","categories_Wine Bars, Bars, Food, Bubble Tea, Restaurants, Nightlife, Steakhouses, Seafood, Asian Fusion","categories_Wine Bars, Nightlife, Cocktail Bars, Mexican, Bars, Vegan, Tacos, Restaurants","categories_Wine Bars, Pizza, Beer Bar, Bars, Restaurants, Nightlife","categories_Wine Tasting Room, Tapas/Small Plates, Nightlife, Restaurants, American (New), Wine Bars, Bars, Wineries, Food, Arts & Entertainment","categories_Wraps, Restaurants, Soup, Salad","categories_Wraps, Salad, Greek, Restaurants",state_ABE,state_BC,state_CO,state_FL,state_GA,state_KS,state_KY,state_MA,state_MN,state_NH,state_OH,state_OR,state_TX,state_VA,state_WA,state_WY
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32017,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
32018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
32019,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
32020,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [None]:
#

In [128]:
# Assign the data to X and y

y = yelp['Stars Y']


In [129]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_dummies, y, random_state=42)

In [130]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [131]:
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [132]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8399816788807462
Testing Data Score: 0.8411191606295279


In [133]:
print(f'Actual:\t\t{list(y_test[:10])}')
print(f'Predicted:\t{list(classifier.predict(X_test[:10]))}')

Actual:		[1, 0, 1, 0, 0, 1, 0, 1, 1, 1]
Predicted:	[1, 1, 1, 1, 1, 1, 0, 1, 1, 1]


In [134]:
from sklearn.metrics import confusion_matrix, classification_report

y_true = y_test
y_pred = classifier.predict(X_test)
confusion_matrix(y_true, y_pred)

array([[  28, 1244],
       [  28, 6706]])

In [135]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.50      0.02      0.04      1272
           1       0.84      1.00      0.91      6734

    accuracy                           0.84      8006
   macro avg       0.67      0.51      0.48      8006
weighted avg       0.79      0.84      0.77      8006

