In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [14]:
# Loading the first few rows of the TripAdvisor reviews dataset from a CSV file
file_path = ("tripadvisor_review.csv")
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,User ID,Category 1,Category 2,Category 3,Category 4,Category 5,Category 6,Category 7,Category 8,Category 9,Category 10
0,User 1,0.93,1.8,2.29,0.62,0.8,2.42,3.19,2.79,1.82,2.42
1,User 2,1.02,2.2,2.66,0.64,1.42,3.18,3.21,2.63,1.86,2.32
2,User 3,1.22,0.8,0.54,0.53,0.24,1.54,3.18,2.8,1.31,2.5
3,User 4,0.45,1.8,0.29,0.57,0.46,1.52,3.18,2.96,1.57,2.86
4,User 5,0.51,1.2,1.18,0.57,1.54,2.02,3.18,2.78,1.18,2.54


In [15]:
# Define a list of feature names representing different review categories
features = ['Art Galleries', 'Dance Clubs', 'Juice Bars', 'Restaurants', 'Museums', 'Resorts', 'Parks', 'Beaches', 'Theaters', 'Holy Places']
df.columns = ['User ID'] + features
df.head()

Unnamed: 0,User ID,Art Galleries,Dance Clubs,Juice Bars,Restaurants,Museums,Resorts,Parks,Beaches,Theaters,Holy Places
0,User 1,0.93,1.8,2.29,0.62,0.8,2.42,3.19,2.79,1.82,2.42
1,User 2,1.02,2.2,2.66,0.64,1.42,3.18,3.21,2.63,1.86,2.32
2,User 3,1.22,0.8,0.54,0.53,0.24,1.54,3.18,2.8,1.31,2.5
3,User 4,0.45,1.8,0.29,0.57,0.46,1.52,3.18,2.96,1.57,2.86
4,User 5,0.51,1.2,1.18,0.57,1.54,2.02,3.18,2.78,1.18,2.54


In [16]:
# Remove the 'User ID' column from the DataFrame
df = df.drop(columns='User ID')
df.head(10)


Unnamed: 0,Art Galleries,Dance Clubs,Juice Bars,Restaurants,Museums,Resorts,Parks,Beaches,Theaters,Holy Places
0,0.93,1.8,2.29,0.62,0.8,2.42,3.19,2.79,1.82,2.42
1,1.02,2.2,2.66,0.64,1.42,3.18,3.21,2.63,1.86,2.32
2,1.22,0.8,0.54,0.53,0.24,1.54,3.18,2.8,1.31,2.5
3,0.45,1.8,0.29,0.57,0.46,1.52,3.18,2.96,1.57,2.86
4,0.51,1.2,1.18,0.57,1.54,2.02,3.18,2.78,1.18,2.54
5,0.99,1.28,0.72,0.27,0.74,1.26,3.17,2.89,1.66,3.66
6,0.9,1.36,0.26,0.32,0.86,1.58,3.17,2.66,1.22,3.22
7,0.74,1.4,0.22,0.41,0.82,1.5,3.17,2.81,1.54,2.88
8,1.12,1.76,1.04,0.64,0.82,2.14,3.18,2.79,1.41,2.54
9,0.7,1.36,0.22,0.26,1.5,1.54,3.17,2.82,2.24,3.12


In [17]:
# Calculate the sum of each column in the DataFrame to get the total counts or values for each feature
column_totals = df.sum()
column_totals

Art Galleries     875.33
Dance Clubs      1325.56
Juice Bars        993.04
Restaurants       521.85
Museums           920.94
Resorts          1806.04
Parks            3117.32
Beaches          2778.36
Theaters         1538.05
Holy Places      2743.24
dtype: float64

In [18]:
# Create a new DataFrame 'y' by applying a threshold to the 'Resorts' column
# For each value in the 'Resorts' column, assign 1 if the value is greater than or equal to 3, otherwise assign 0
y = pd.DataFrame(df['Resorts'].apply(lambda x:1 if x >= 3 else 0))
y.head()

Unnamed: 0,Resorts
0,0
1,1
2,0
3,0
4,0


In [19]:
# Generate and display summary statistics for each column in the DataFrame, including count, mean, standard deviation, min, max, and quartiles
df.describe()

Unnamed: 0,Art Galleries,Dance Clubs,Juice Bars,Restaurants,Museums,Resorts,Parks,Beaches,Theaters,Holy Places
count,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0,980.0
mean,0.893194,1.352612,1.013306,0.5325,0.939735,1.842898,3.180939,2.835061,1.569439,2.799224
std,0.326912,0.47828,0.788607,0.279731,0.43743,0.539538,0.007824,0.137505,0.364629,0.32138
min,0.34,0.0,0.13,0.15,0.06,0.14,3.16,2.42,0.74,2.14
25%,0.67,1.08,0.27,0.41,0.64,1.46,3.18,2.74,1.31,2.54
50%,0.83,1.28,0.82,0.5,0.9,1.8,3.18,2.82,1.54,2.78
75%,1.02,1.56,1.5725,0.58,1.2,2.2,3.18,2.91,1.76,3.04
max,3.22,3.64,3.62,3.44,3.3,3.76,3.21,3.39,3.17,3.66


In [20]:
# Create a new DataFrame 'X' by making a copy of the original DataFrame and dropping the 'Resorts' column
# This is done to separate the feature variables (X) from the target variable ('Resorts')
X = df.copy().drop(columns='Resorts',axis=1)
X.head()

Unnamed: 0,Art Galleries,Dance Clubs,Juice Bars,Restaurants,Museums,Parks,Beaches,Theaters,Holy Places
0,0.93,1.8,2.29,0.62,0.8,3.19,2.79,1.82,2.42
1,1.02,2.2,2.66,0.64,1.42,3.21,2.63,1.86,2.32
2,1.22,0.8,0.54,0.53,0.24,3.18,2.8,1.31,2.5
3,0.45,1.8,0.29,0.57,0.46,3.18,2.96,1.57,2.86
4,0.51,1.2,1.18,0.57,1.54,3.18,2.78,1.18,2.54


In [21]:
# Split the data into training and testing sets
# X_train and y_train are the training sets, while X_test and y_test are the testing sets
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=1)

In [22]:
# Initialize a StandardScaler and fit it to the training data to calculate the mean and standard deviation for scaling
# Transform the training data using the fitted scaler to standardize it
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled

array([[-0.10738138, -0.72444754, -0.09881318, ...,  0.54870149,
        -0.97328597, -0.92826726],
       [-1.26491209, -0.05667022, -0.32549922, ...,  1.55532269,
        -0.89075336, -1.23694773],
       [-0.68614674,  0.27721844, -0.942589  , ..., -0.17031366,
        -0.09293809,  0.30645461],
       ...,
       [-1.17352809, -0.30708672,  0.07749818, ...,  2.77764844,
        -0.28551419,  0.05951023],
       [-1.69137077, -0.39055888, -1.04333835, ..., -0.09841214,
         0.51230108,  0.18298242],
       [ 0.37999997,  0.02680195, -0.12400052, ...,  0.54870149,
         0.95247503,  1.23249601]])

In [23]:
# Use the previously fitted scaler to transform the test data, applying the same scaling parameters (mean and standard deviation) used on the training data
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[ 0.37999997, -0.89139187,  0.66940062, ..., -0.09841214,
         0.34723586,  1.4177043 ],
       [ 0.10584796, -0.14014238, -0.99296367, ...,  1.05201209,
        -1.24839469,  0.80034336],
       [-1.3562961 ,  0.36069061, -1.06852569, ..., -0.96123031,
         0.42976847,  0.73860727],
       ...,
       [-0.59476274, -0.89139187,  2.21842189, ..., -0.67362426,
        -0.28551419, -1.42215601],
       [-0.86891474, -0.55750321,  1.52577011, ..., -1.53644243,
        -0.72568813, -1.42215601],
       [-1.17352809, -0.97486404, -0.942589  , ...,  0.33299695,
        -1.5235034 ,  0.73860727]])

In [24]:
# Initialize a Logistic Regression model with a specified random state for reproducibility and a maximum of 100 iterations for convergence
logistic_r_model = LogisticRegression(random_state=1,max_iter=100)
# Fit the Logistic Regression model using the training data
lr_model = logistic_r_model.fit(X_train,y_train)
#calculate the accuracy score
lr_model.score(X_test,y_test)

  y = column_or_1d(y, warn=True)


0.9877551020408163