# 🎥 Movie Recommendation & Sentiment Analysis System with Weather Info
This project combines:
- A Movie Recommendation System
- Sentiment Analysis of user reviews
- Weather Forecast display (for additional info)

Technologies used: Python, Pandas, Scikit-Learn, NLTK, Flask, OpenWeatherMap API 


Imports and Setup

In [25]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Load Dataset

In [4]:
data = pd.read_csv('datasets/netflix_titles.csv',encoding='latin1')

In [5]:
data.shape

(8809, 26)

In [10]:
data.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

 Data Preprocessing & Cleaning

In [11]:
for n in range(12,26):
    data.drop(columns=[f'Unnamed: {n}'],inplace=True)

In [12]:
data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [13]:
data.shape

(8809, 12)

In [14]:
data.dropna(inplace=True)

In [15]:
data_100 = data.head(100)
data_100.to_csv('datasets/netflix_titles_100.csv')

Feature Engineering

In [18]:
tfidf = TfidfVectorizer()
tfidf_matrix=tfidf.fit_transform(data['description'])

In [22]:
data.iloc[0]['title']

'Sankofa'

In [24]:
movie_index = data[data['title']=='Sankofa'].index[0]
movie_index

7

In [27]:
tfidf_movie=tfidf_matrix[movie_index]
desc_cosine_sim = cosine_similarity(tfidf_matrix,tfidf_movie)

In [28]:
recommendation = np.argsort(desc_cosine_sim.flatten())[::-1][:5]

In [29]:
[data.iloc[i][['title','description']] for i in recommendation]

[title                                                   Paranoia
 description    Blackmailed by his company's CEO, a low-level ...
 Name: 29, dtype: object,
 title                                    Manorama Six Feet Under
 description    A government employee and aspiring crime write...
 Name: 1945, dtype: object,
 title                                            The Arbitration
 description    An arbitration panel is formed after a company...
 Name: 2882, dtype: object,
 title                                                 First Kiss
 description    A starry-eyed employee of a hot cosmetics comp...
 Name: 3465, dtype: object,
 title                                                 Ex Machina
 description    A coder at a tech company wins a week-long ret...
 Name: 6716, dtype: object]

 Sentiment Analysis on Movie Reviews

In [30]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.corpus import stopwords
import string


In [31]:
def clean_text(text):
    text = text.lower()
    text = ''.join([c for c in text if c not in string.punctuation])
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ''.join(tokens)

In [32]:
obj = SentimentIntensityAnalyzer()

In [33]:
sentence = "Ram is really good"
sentiment_dict = obj.polarity_scores(sentence)
print(sentiment_dict)

{'neg': 0.0, 'neu': 0.484, 'pos': 0.516, 'compound': 0.4927}


In [34]:
sentence = "Rahul is really bad"
sentiment_dict = obj.polarity_scores(sentence)
print(sentiment_dict)

{'neg': 0.558, 'neu': 0.442, 'pos': 0.0, 'compound': -0.5849}


Weather Forecasting Module

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

  from pandas.core import (


In [3]:
df = pd.read_csv('datasets/weatherHistory.csv')  # Replace with your actual file
df.head()
df.shape

(96453, 12)

Data Preprocessing & Cleaning

In [37]:
df = df.dropna()

In [38]:
df['Formatted Date'] = pd.to_datetime(df['Formatted Date'],utc=True)

In [39]:
df['month'] = df['Formatted Date'].dt.month
df['day'] = df['Formatted Date'].dt.day
df['year'] = df['Formatted Date'].dt.year
df['dayofweek'] = df['Formatted Date'].dt.dayofweek

In [40]:
df = df.drop(columns=['Formatted Date'])

In [41]:
# Feature Selection

In [42]:
df.columns

Index(['Summary', 'Precip Type', 'Temperature (C)', 'Apparent Temperature (C)',
       'Humidity', 'Wind Speed (km/h)', 'Wind Bearing (degrees)',
       'Visibility (km)', 'Loud Cover', 'Pressure (millibars)',
       'Daily Summary', 'month', 'day', 'year', 'dayofweek'],
      dtype='object')

In [43]:
X = df.drop('Temperature (C)', axis=1)
y = df['Temperature (C)']

In [44]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [45]:
# Identify categorical columns (e.g. string/object type)
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'  # keep other columns as-is (numerical ones)
)

# Full pipeline with model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit model
model.fit(X_train, y_train)


In [6]:
import pickle

# Save the model to a file
with open('weather_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [8]:
# Load the model from the file
with open('weather_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Use it for prediction
predictions = loaded_model.predict(X_test)

In [46]:
import requests

API_KEY = "Api_key"
CITY = "Delhi"
URL = f"http://api.openweathermap.org/data/2.5/weather?q={CITY}&appid={API_KEY}&units=metric"

response = requests.get(URL)
data = response.json()

# Extract required features
humidity = data['main']['humidity']
pressure = data['main']['pressure']
wind_speed = data['wind']['speed']
temp = data['main']['temp']

print("Current Weather Data:")
print(f"Temperature: {temp}°C")
print(f"Humidity: {humidity}%")
print(f"Pressure: {pressure} hPa")
print(f"Wind Speed: {wind_speed} m/s")

Current Weather Data:
Temperature: 43.1°C
Humidity: 13%
Pressure: 998 hPa
Wind Speed: 5.26 m/s


In [None]:
import numpy as np

# Create input array (reshape to 2D for sklearn)
input_data = np.array([[temp, humidity, pressure, wind_speed]])

In [49]:

import requests
import pickle
import numpy as np
import datetime

# Step 1: Fetch current weather
API_KEY = 'cea962528ea2296e50a65b42c009e86c'
CITY = 'kochi'
URL = f'http://api.openweathermap.org/data/2.5/weather?q={CITY}&appid={API_KEY}&units=metric'

response = requests.get(URL)
data = response.json()

# Step 2: Extract values
temperature = data['main']['temp']
humidity = data['main']['humidity']
pressure = data['main']['pressure']  # in hPa, equivalent to millibars
wind_speed = data['wind']['speed'] * 3.6  # m/s to km/h
wind_bearing = data['wind'].get('deg', 0)  # default to 0 if missing

# Optional: Use defaults or estimate if not provided by API
apparent_temp = temperature  # assume equal if unavailable
visibility = data.get('visibility', 10000) / 1000  # in km
cloud_cover = data['clouds'].get('all', 0) / 100  # convert 0–100 to 0–1 scale

# Categorical fields (convert to strings)
summary = data['weather'][0]['main']  # e.g., 'Clear'
precip_type = 'rain' if 'rain' in data else 'none'
daily_summary = data['weather'][0]['description']

# Date features
today = datetime.datetime.now()
month = today.month
day = today.day
year = today.year
dayofweek = today.weekday()  # Monday=0

# Step 3: Create a DataFrame with a single row
import pandas as pd

input_df = pd.DataFrame([{
    'Summary': summary,
    'Precip Type': precip_type,
#     'Temperature (C)': temperature,
    'Apparent Temperature (C)': apparent_temp,
    'Humidity': humidity,
    'Wind Speed (km/h)': wind_speed,
    'Wind Bearing (degrees)': wind_bearing,
    'Visibility (km)': visibility,
    'Loud Cover': cloud_cover,
    'Pressure (millibars)': pressure,
    'Daily Summary': daily_summary,
    'month': month,
    'day': day,
    'year': year,
    'dayofweek': dayofweek
}])

# Optional: apply same preprocessing (LabelEncoding, OneHot, etc.) if model needs it

# Step 4: Load model and predict
with open('weather_model.pkl', 'rb') as f:
    model = pickle.load(f)

# If your model expects encoded columns, you must preprocess `input_df` exactly like your training data
prediction = model.predict(input_df)

print("Model Prediction:", prediction)

Model Prediction: [26.80338889]


In [None]:
def interpret_weather(temp_c):
    if temp_c >= 30:
        return "Sunny", "It's a scorcher! Time for a summer blockbuster."
    elif 20 <= temp_c < 30:
        return "Clear", "Clear skies ahead — perfect for a feel-good film."
    elif 10 <= temp_c < 20:
        return "Cloudy", "Cloudy - just like a slow-burn mystery thriller!"
    elif 0 <= temp_c < 10:
        return "Chilly", "Chilly vibes call for a cozy rom-com."
    else:
        return "Snowy", "Snowy and serene — maybe a classic drama?"

In [None]:
interpret_weather(prediction)