In [1]:
pip install yfinance pandas numpy scikit-learn matplotlib

Collecting yfinance
  Downloading yfinance-0.2.66-py2.py3-none-any.whl.metadata (6.0 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.12.tar.gz (19 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.6-py311-none-any.whl.metadata (23 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.18.2.tar.gz (949 kB)
     ---------------------------------------- 0.0/949.2 kB ? eta -:--:--
     ------------- ------------------------ 337.9/949.2 kB 7.1 MB/s eta 0:00:01
     ----------------------------------- -- 890.9/949.2 kB 9.5 MB/s eta 0:00:01
     -------------------------------------- 949.2/949.2 kB 8.6 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with 

In [2]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [3]:
# Example: Fetch Apple's stock data (AAPL)
data = yf.download("AAPL", start="2015-01-01", end="2025-01-01")
print(data.head())

  data = yf.download("AAPL", start="2015-01-01", end="2025-01-01")
[*********************100%***********************]  1 of 1 completed

Price           Close       High        Low       Open     Volume
Ticker           AAPL       AAPL       AAPL       AAPL       AAPL
Date                                                             
2015-01-02  24.261044  24.729267  23.821668  24.718171  212818400
2015-01-05  23.577570  24.110146  23.391169  24.030260  257142000
2015-01-06  23.579798  23.839428  23.218089  23.641931  263188400
2015-01-07  23.910433  24.010290  23.677430  23.788384  160423600
2015-01-08  24.829124  24.886821  24.121242  24.238854  237458000





In [9]:
import yfinance as yf
import pandas as pd
import numpy as np

tickers = ["AAPL", "MSFT", "JNJ", "NVDA", "TSLA", "AMZN", "PG", "XOM", "META", "NFLX"]


def get_stock_data(tickers):
    stocks = []
    for ticker in tickers:
        data = yf.download(ticker, period="1y", interval="1d")
        data["Returns"] = data["Close"].pct_change()
        volatility = np.std(data["Returns"]) * np.sqrt(252)  # annualized volatility
        mean_return = data["Returns"].mean() * 252  # annualized return
        info = yf.Ticker(ticker).info
        stocks.append(
            {
                "Ticker": ticker,
                "MeanReturn": mean_return,
                "Volatility": volatility,
                "Beta": info.get("beta", np.nan),
                "Sector": info.get("sector", "Unknown"),
                "MarketCap": info.get("marketCap", 0),
            }
        )
    return pd.DataFrame(stocks)


df = get_stock_data(tickers)
print(df)

  data = yf.download(ticker, period="1y", interval="1d")
[*********************100%***********************]  1 of 1 completed
  data = yf.download(ticker, period="1y", interval="1d")
[*********************100%***********************]  1 of 1 completed
  data = yf.download(ticker, period="1y", interval="1d")
[*********************100%***********************]  1 of 1 completed
  data = yf.download(ticker, period="1y", interval="1d")
[*********************100%***********************]  1 of 1 completed
  data = yf.download(ticker, period="1y", interval="1d")
[*********************100%***********************]  1 of 1 completed
  data = yf.download(ticker, period="1y", interval="1d")
[*********************100%***********************]  1 of 1 completed
  data = yf.download(ticker, period="1y", interval="1d")
[*********************100%***********************]  1 of 1 completed
  data = yf.download(ticker, period="1y", interval="1d")
[*********************100%***********************]  1 of 1 co

  Ticker  MeanReturn  Volatility   Beta                  Sector      MarketCap
0   AAPL    0.200698    0.327592  1.094              Technology  3989245001728
1   MSFT    0.260471    0.247061  1.023              Technology  3950834876416
2    JNJ    0.216203    0.191790  0.392              Healthcare   458488872960
3   NVDA    0.436083    0.493490  2.123              Technology  4662207447040
4   TSLA    0.778589    0.677473  2.086       Consumer Cyclical  1504667107328
5   AMZN    0.246417    0.340491  1.281       Consumer Cyclical  2420615151616
6     PG   -0.067022    0.186011  0.364      Consumer Defensive   355145252864
7    XOM    0.037979    0.233789  0.484                  Energy   494280867840
8   META    0.333870    0.366557  1.203  Communication Services  1886166188032
9   NFLX    0.438242    0.328398  1.593  Communication Services   463800467456


In [10]:
def categorize_risk(row):
    if row["Volatility"] < 0.25:
        return "Low"
    elif row["Volatility"] < 0.40:
        return "Medium"
    else:
        return "High"


df["RiskCategory"] = df.apply(categorize_risk, axis=1)
print(df[["Ticker", "Volatility", "RiskCategory"]])

  Ticker  Volatility RiskCategory
0   AAPL    0.327592       Medium
1   MSFT    0.247061          Low
2    JNJ    0.191790          Low
3   NVDA    0.493490         High
4   TSLA    0.677473         High
5   AMZN    0.340491       Medium
6     PG    0.186011          Low
7    XOM    0.233789          Low
8   META    0.366557       Medium
9   NFLX    0.328398       Medium


In [11]:
profiles = []

age_groups = ["18-25", "25-35", "35-50", "50+"]
horizons = ["Short Term", "Medium Term", "Long Term"]
experience = ["Beginner", "Intermediate", "Advanced"]
incomes = ["<5L", "5-10L", "10-20L", "20L+"]
risk_tolerance = ["Low", "Medium", "High"]

import random

for _ in range(500):  # 500 synthetic samples
    profile = {
        "AgeGroup": random.choice(age_groups),
        "Horizon": random.choice(horizons),
        "Experience": random.choice(experience),
        "Income": random.choice(incomes),
        "RiskTolerance": random.choice(risk_tolerance),
    }
    # Map user risk to stock risk
    profile["RecommendedRisk"] = profile["RiskTolerance"]
    # Randomly assign one stock from that risk category
    stock = df[df["RiskCategory"] == profile["RecommendedRisk"]].sample(1)
    profile["Ticker"] = stock["Ticker"].values[0]
    profiles.append(profile)

train_df = pd.DataFrame(profiles)
print(train_df.head())

  AgeGroup      Horizon    Experience  Income RiskTolerance RecommendedRisk  \
0    25-35   Short Term  Intermediate    20L+          High            High   
1    18-25  Medium Term      Advanced  10-20L           Low             Low   
2    25-35   Short Term      Beginner  10-20L           Low             Low   
3    18-25  Medium Term      Beginner   5-10L        Medium          Medium   
4      50+    Long Term      Beginner   5-10L           Low             Low   

  Ticker  
0   TSLA  
1     PG  
2   MSFT  
3   META  
4    JNJ  


In [12]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

encoders = {}
for col in ["AgeGroup", "Horizon", "Experience", "Income", "RiskTolerance", "Ticker"]:
    enc = LabelEncoder()
    train_df[col] = enc.fit_transform(train_df[col])
    encoders[col] = enc

X = train_df[["AgeGroup", "Horizon", "Experience", "Income", "RiskTolerance"]]
y = train_df["Ticker"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.35


In [13]:
new_user = {
    "AgeGroup": "25-35",
    "Horizon": "Long Term",
    "Experience": "Intermediate",
    "Income": "10-20L",
    "RiskTolerance": "Medium",
}

# Encode input
input_df = pd.DataFrame([new_user])
for col in input_df.columns:
    input_df[col] = encoders[col].transform(input_df[col])

pred_ticker_encoded = model.predict(input_df)[0]
pred_ticker = encoders["Ticker"].inverse_transform([pred_ticker_encoded])[0]

print(f"Recommended Stock: {pred_ticker}")

Recommended Stock: META


In [14]:
probs = model.predict_proba(input_df)[0]
top_indices = probs.argsort()[-6:][::-1]  # top 6
top_tickers = encoders["Ticker"].inverse_transform(top_indices)

print("\nTop 6 Stocks Suggested by Model:")
for ticker, prob in zip(top_tickers, probs[top_indices]):
    print(f"{ticker}  —  Confidence: {prob:.2f}")


Top 6 Stocks Suggested by Model:
META  —  Confidence: 0.64
AMZN  —  Confidence: 0.16
AAPL  —  Confidence: 0.09
NFLX  —  Confidence: 0.07
PG  —  Confidence: 0.02
JNJ  —  Confidence: 0.01


In [15]:
import pickle

with open("model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("encoders.pkl", "wb") as f:
    pickle.dump(encoders, f)