<a href="https://colab.research.google.com/github/rayapureddi/COGNIFYZ/blob/main/Cognifyz_ML_Internship.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**TASK-1**

In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [41]:
#Load the dataset
df = pd.read_csv("restaurant_data.csv")

In [42]:
#Get the column names
columns_list = df.columns.tolist()

In [43]:
#print the columns names
print("Columns in the dataset:")
print(columns_list)

Columns in the dataset:
['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address', 'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines', 'Average Cost for two', 'Currency', 'Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu', 'Price range', 'Aggregate rating', 'Rating color', 'Rating text', 'Votes']


In [44]:
#print datatypes
print(df.dtypes)

Restaurant ID             int64
Restaurant Name          object
Country Code              int64
City                     object
Address                  object
Locality                 object
Locality Verbose         object
Longitude               float64
Latitude                float64
Cuisines                 object
Average Cost for two      int64
Currency                 object
Has Table booking        object
Has Online delivery      object
Is delivering now        object
Switch to order menu     object
Price range               int64
Aggregate rating        float64
Rating color             object
Rating text              object
Votes                     int64
dtype: object


In [45]:
#Select relevant features and target variable
X = df.drop("Aggregate rating", axis=1)
y = df["Aggregate rating"]

In [46]:
#print non numeric values
non_numeric_columns = df.select_dtypes(exclude=['float64', 'int64']).columns
print("Non-numeric columns:", non_numeric_columns)

Non-numeric columns: Index(['Restaurant Name', 'City', 'Address', 'Locality', 'Locality Verbose',
       'Cuisines', 'Currency', 'Has Table booking', 'Has Online delivery',
       'Is delivering now', 'Switch to order menu', 'Rating color',
       'Rating text'],
      dtype='object')


In [47]:
# Drop the 'Cuisines' column (if it is non-numeric and not used for prediction)
df = df.drop('Cuisines', axis=1)

In [48]:
#Encode categorical variables using one-hot encoding
X_encoded = pd.get_dummies(X, drop_first=True)

In [49]:
#Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [50]:
#Train the Decision Tree Regression model
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

In [51]:
#Make predictions on the testing data
y_pred = model.predict(X_test)

In [52]:
#Evaluate the model's performance using Mean Squared Error (MSE) and R-squared (R2)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)

Mean Squared Error (MSE): 0.0489586603872318
R-squared (R2): 0.9784901950672517


In [53]:
#Analyze the most influential features affecting restaurant ratings (optional)
#Decision trees allow you to interpret feature importance easily. You can analyze the feature_importances_ attribute of the model.
importance = model.feature_importances_
feature_names = X_encoded.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
sorted_features = feature_importance_df.sort_values(by='Importance', ascending=False)

print("\nMost Influential Features:")
print(sorted_features)


Most Influential Features:
                                                 Feature    Importance
20825                              Rating text_Not rated  8.966539e-01
20819                                Rating color_Orange  5.152611e-02
20826                                   Rating text_Poor  2.219755e-02
20824                                   Rating text_Good  1.308405e-02
20818                                 Rating color_Green  2.579984e-03
...                                                  ...           ...
20103                           Cuisines_Japanese, Steak -1.008831e-19
3198      Restaurant Name_Jaco's Bayfront Bar and Grille -2.017661e-19
2700                   Restaurant Name_Grand Madras Cafe -2.017661e-19
15969  Address_Shop 8, Mount Kailash Market, East of ... -2.017661e-19
17659                                 Locality_Vikaspuri -1.614129e-18

[20828 rows x 2 columns]


**TASK-2**

In [54]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [55]:
#Load the dataset and handle missing values (if any)
df = pd.read_csv("restaurant_data.csv")

In [56]:
#Preprocess the dataset - Encode categorical variables
# Convert 'Cuisines' column to lowercase for case-insensitive matching
df['Cuisines'] = df['Cuisines'].str.lower()

In [57]:
#Determine the criteria for restaurant recommendations
#Sample user preferences - replace these with actual user preferences
sample_user_cuisine_preference = 'north indian'
sample_user_price_range_preference = 2.0  #Assuming 2.0 corresponds to moderate-priced restaurants

In [58]:
#Implement content-based filtering using Decision Tree Regressor
#Encode 'Cuisines' using one-hot encoding
X = pd.get_dummies(df['Cuisines'], drop_first=True)
y = df['Aggregate rating']

In [59]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [60]:
# Train the Decision Tree Regressor model
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

In [61]:
#Test the recommendation system
#Get the encoded cuisine preference for the sample user preference
sample_user_cuisine_encoded = pd.get_dummies([sample_user_cuisine_preference.lower()], prefix='', prefix_sep='').reindex(columns=X.columns, fill_value=0)
#Predict the restaurant rating based on the user's cuisine preference using the Decision Tree Regressor model
predicted_rating = model.predict(sample_user_cuisine_encoded)

print(f"User Preference: Cuisine={sample_user_cuisine_preference}, Price Range={sample_user_price_range_preference}")
print(f"Predicted Restaurant Rating: {predicted_rating[0]}")

User Preference: Cuisine=north indian, Price Range=2.0
Predicted Restaurant Rating: 1.6851498637602187


In [62]:
#Evaluate the model's performance (optional)
#Make predictions on the testing data
y_pred = model.predict(X_test)
#Evaluate the model's performance using Mean Squared Error (MSE) and R-squared (R2)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nModel Performance:")
print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)


Model Performance:
Mean Squared Error (MSE): 1.9332763175551513
R-squared (R2): 0.1506222567609634


**TASK-3**

In [63]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [64]:
# Generate a synthetic dataset for demonstration
np.random.seed(42)
num_samples = 1000
num_features = 5
X = np.random.randn(num_samples, num_features)
y = np.random.randint(2, size=num_samples)


In [65]:
#Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [66]:
# Initialize the logistic regression model
logistic_model = LogisticRegression()

In [67]:
# Train the model using the training data
logistic_model.fit(X_train, y_train)

In [68]:
#Make predictions on the testing data
y_pred = logistic_model.predict(X_test)

In [69]:
#Evaluate the model's performance using confusion metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
confusion = confusion_matrix(y_test, y_pred)

print("Model Performance:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confuion Matrix:\n", confusion)

Model Performance:
Accuracy: 0.48
Precision: 0.47874188311688315
Recall: 0.48
F1 Score: 0.47817103808662437
Confuion Matrix:
 [[41 57]
 [47 55]]


**TASK-4**

In [70]:
import pandas as pd
import plotly.express as px

In [71]:
#Load the dataset
df = pd.read_csv("restaurant_data.csv")

In [72]:
#Explore the latitude and longitude coordinates of the restaurants and visualize their distribution on a map
fig = px.scatter_mapbox(df, lat='Latitude', lon='Longitude', hover_name='Restaurant Name', hover_data=['Locality', 'City', 'Cuisines', 'Aggregate rating'],
                        color='Aggregate rating', size='Votes', size_max=15, zoom=10)

In [73]:
#Customize the map layout
fig.update_layout(mapbox_style='open-street-map')
fig.update_layout(margin={'r': 0, 't': 0, 'l': 0, 'b': 0})
fig.show()

In [74]:
#Group the restaurants by city or locality and analyze the concentration of restaurants in different areas
#Group by city and count the number of restaurants in each city
restaurant_count_by_city = df.groupby('City').size().reset_index(name='Restaurant Count')

In [75]:
#Display the restaurant count by city
print(restaurant_count_by_city)

                City  Restaurant Count
0          Abu Dhabi                20
1               Agra                20
2          Ahmedabad                21
3             Albany                20
4          Allahabad                20
..               ...               ...
136          Weirton                 1
137  Wellington City                20
138   Winchester Bay                 1
139          Yorkton                 1
140        ��stanbul                14

[141 rows x 2 columns]


In [76]:
#Calculate statistics such as the average ratings, cuisines, or price ranges by city or locality
#Group by city and calculate the average rating for each city
average_rating_by_city = df.groupby('City')['Aggregate rating'].mean().reset_index(name='Average Rating')

In [77]:
#Display the average rating by city
print(average_rating_by_city)

                City  Average Rating
0          Abu Dhabi        4.300000
1               Agra        3.965000
2          Ahmedabad        4.161905
3             Albany        3.555000
4          Allahabad        3.395000
..               ...             ...
136          Weirton        3.900000
137  Wellington City        4.250000
138   Winchester Bay        3.200000
139          Yorkton        3.300000
140        ��stanbul        4.292857

[141 rows x 2 columns]


In [78]:
#Analyze the data further to identify any interesting insights or patterns in restaurant locations.
#use plotly express to create a bar chart showing the average rating by city.
fig_rating_by_city = px.bar(average_rating_by_city, x='City', y='Average Rating', labels={'Average Rating': 'Average Rating'},
                            title='Average Rating by City')
fig_rating_by_city.show()