In [1]:
pip install yfinance pandas numpy scikit-learn matplotlib

Collecting yfinance
  Downloading yfinance-0.2.66-py2.py3-none-any.whl.metadata (6.0 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.12.tar.gz (19 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.6-py311-none-any.whl.metadata (23 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.18.2.tar.gz (949 kB)
     ---------------------------------------- 0.0/949.2 kB ? eta -:--:--
     ------------- ------------------------ 337.9/949.2 kB 7.1 MB/s eta 0:00:01
     ----------------------------------- -- 890.9/949.2 kB 9.5 MB/s eta 0:00:01
     -------------------------------------- 949.2/949.2 kB 8.6 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with 

In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [2]:
# Example: Fetch Apple's stock data (AAPL)
data = yf.download("AAPL", start="2015-01-01", end="2025-01-01")
print(data.head())

  data = yf.download("AAPL", start="2015-01-01", end="2025-01-01")
[*********************100%***********************]  1 of 1 completed

Price           Close       High        Low       Open     Volume
Ticker           AAPL       AAPL       AAPL       AAPL       AAPL
Date                                                             
2015-01-02  24.261055  24.729278  23.821679  24.718182  212818400
2015-01-05  23.577574  24.110150  23.391173  24.030263  257142000
2015-01-06  23.579792  23.839422  23.218083  23.641926  263188400
2015-01-07  23.910440  24.010298  23.677438  23.788391  160423600
2015-01-08  24.829128  24.886824  24.121246  24.238858  237458000





In [3]:
import yfinance as yf
import pandas as pd
import numpy as np

tickers = ["AAPL", "MSFT", "JNJ", "NVDA", "TSLA", "AMZN", "PG", "XOM", "META", "NFLX"]


def get_stock_data(tickers):
    stocks = []
    for ticker in tickers:
        data = yf.download(ticker, period="1y", interval="1d")
        data["Returns"] = data["Close"].pct_change()
        volatility = np.std(data["Returns"]) * np.sqrt(252)  # annualized volatility
        mean_return = data["Returns"].mean() * 252  # annualized return
        info = yf.Ticker(ticker).info
        stocks.append(
            {
                "Ticker": ticker,
                "MeanReturn": mean_return,
                "Volatility": volatility,
                "Beta": info.get("beta", np.nan),
                "Sector": info.get("sector", "Unknown"),
                "MarketCap": info.get("marketCap", 0),
            }
        )
    return pd.DataFrame(stocks)


df = get_stock_data(tickers)
print(df)

  data = yf.download(ticker, period="1y", interval="1d")
[*********************100%***********************]  1 of 1 completed
  data = yf.download(ticker, period="1y", interval="1d")
[*********************100%***********************]  1 of 1 completed
  data = yf.download(ticker, period="1y", interval="1d")
[*********************100%***********************]  1 of 1 completed
  data = yf.download(ticker, period="1y", interval="1d")
[*********************100%***********************]  1 of 1 completed
  data = yf.download(ticker, period="1y", interval="1d")
[*********************100%***********************]  1 of 1 completed
  data = yf.download(ticker, period="1y", interval="1d")
[*********************100%***********************]  1 of 1 completed
  data = yf.download(ticker, period="1y", interval="1d")
[*********************100%***********************]  1 of 1 completed
  data = yf.download(ticker, period="1y", interval="1d")
[*********************100%***********************]  1 of 1 co

  Ticker  MeanReturn  Volatility   Beta                  Sector      MarketCap
0   AAPL    0.200645    0.326936  1.094              Technology  3991873257472
1   MSFT    0.281247    0.247430  1.023              Technology  4036352540672
2    JNJ    0.199235    0.192147  0.392              Healthcare   451164602368
3   NVDA    0.448908    0.492667  2.123              Technology  4729648185344
4   TSLA    0.802100    0.676520  2.086       Consumer Cyclical  1544344174592
5   AMZN    0.253407    0.339881  1.281       Consumer Cyclical  2439768178688
6     PG   -0.060310    0.185758  0.364      Consumer Defensive   356842569728
7    XOM    0.033611    0.233361  0.484                  Energy   492213207040
8   META    0.334596    0.365824  1.203  Communication Services  1890022195200
9   NFLX    0.441130    0.327753  1.593  Communication Services   465936056320


In [4]:
def categorize_risk(row):
    if row["Volatility"] < 0.25:
        return "Low"
    elif row["Volatility"] < 0.40:
        return "Medium"
    else:
        return "High"


df["RiskCategory"] = df.apply(categorize_risk, axis=1)
print(df[["Ticker", "Volatility", "RiskCategory"]])

  Ticker  Volatility RiskCategory
0   AAPL    0.326936       Medium
1   MSFT    0.247430          Low
2    JNJ    0.192147          Low
3   NVDA    0.492667         High
4   TSLA    0.676520         High
5   AMZN    0.339881       Medium
6     PG    0.185758          Low
7    XOM    0.233361          Low
8   META    0.365824       Medium
9   NFLX    0.327753       Medium


In [5]:
profiles = []

age_groups = ["18-25", "25-35", "35-50", "50+"]
horizons = ["Short Term", "Medium Term", "Long Term"]
experience = ["Beginner", "Intermediate", "Advanced"]
incomes = ["<5L", "5-10L", "10-20L", "20L+"]
risk_tolerance = ["Low", "Medium", "High"]

import random

for _ in range(500):  # 500 synthetic samples
    profile = {
        "AgeGroup": random.choice(age_groups),
        "Horizon": random.choice(horizons),
        "Experience": random.choice(experience),
        "Income": random.choice(incomes),
        "RiskTolerance": random.choice(risk_tolerance),
    }
    # Map user risk to stock risk
    profile["RecommendedRisk"] = profile["RiskTolerance"]
    # Randomly assign one stock from that risk category
    stock = df[df["RiskCategory"] == profile["RecommendedRisk"]].sample(1)
    profile["Ticker"] = stock["Ticker"].values[0]
    profiles.append(profile)

train_df = pd.DataFrame(profiles)
print(train_df.head())

  AgeGroup      Horizon    Experience Income RiskTolerance RecommendedRisk  \
0      50+  Medium Term      Beginner    <5L           Low             Low   
1    18-25    Long Term      Advanced  5-10L        Medium          Medium   
2      50+   Short Term      Advanced   20L+          High            High   
3    18-25   Short Term      Beginner   20L+          High            High   
4    35-50    Long Term  Intermediate    <5L           Low             Low   

  Ticker  
0     PG  
1   AAPL  
2   TSLA  
3   TSLA  
4   MSFT  


In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

encoders = {}
for col in ["AgeGroup", "Horizon", "Experience", "Income", "RiskTolerance", "Ticker"]:
    enc = LabelEncoder()
    train_df[col] = enc.fit_transform(train_df[col])
    encoders[col] = enc

X = train_df[["AgeGroup", "Horizon", "Experience", "Income", "RiskTolerance"]]
y = train_df["Ticker"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.33


In [7]:
new_user = {
    "AgeGroup": "25-35",
    "Horizon": "Long Term",
    "Experience": "Intermediate",
    "Income": "10-20L",
    "RiskTolerance": "Medium",
}

# Encode input
input_df = pd.DataFrame([new_user])
for col in input_df.columns:
    input_df[col] = encoders[col].transform(input_df[col])

pred_ticker_encoded = model.predict(input_df)[0]
pred_ticker = encoders["Ticker"].inverse_transform([pred_ticker_encoded])[0]

print(f"Recommended Stock: {pred_ticker}")

Recommended Stock: AAPL


In [8]:
probs = model.predict_proba(input_df)[0]
top_indices = probs.argsort()[-6:][::-1]  # top 6
top_tickers = encoders["Ticker"].inverse_transform(top_indices)

print("\nTop 6 Stocks Suggested by Model:")
for ticker, prob in zip(top_tickers, probs[top_indices]):
    print(f"{ticker}  —  Confidence: {prob:.2f}")


Top 6 Stocks Suggested by Model:
AAPL  —  Confidence: 0.70
AMZN  —  Confidence: 0.13
META  —  Confidence: 0.11
MSFT  —  Confidence: 0.05
PG  —  Confidence: 0.01
XOM  —  Confidence: 0.00


In [12]:
import numpy as np

# assume your trained model is called `model`
# (no need to reload it!)

print("\n=== Investor Profile Quiz ===")
age_group = input("Age Group (18-25 / 26-40 / 41-60 / 60+): ").strip()
investment_horizon = input(
    "Investment Horizon (Short-term / Medium-term / Long-term): "
).strip()
financial_experience = input(
    "Financial Experience (Beginner / Intermediate / Expert): "
).strip()
annual_income = input("Annual Income (Below 5L / 5L-10L / 10L-25L / 25L+): ").strip()
risk_tolerance = input("Risk Tolerance (Low / Medium / High): ").strip()

# same encoding logic you used during training
mapping = {
    "Low": 0,
    "Medium": 1,
    "High": 2,
    "Beginner": 0,
    "Intermediate": 1,
    "Expert": 2,
    "Short-term": 0,
    "Medium-term": 1,
    "Long-term": 2,
    "18-25": 0,
    "26-40": 1,
    "41-60": 2,
    "60+": 3,
    "Below 5L": 0,
    "5L-10L": 1,
    "10L-25L": 2,
    "25L+": 3,
}

# build feature vector
features = np.array(
    [
        mapping.get(age_group, 0),
        mapping.get(investment_horizon, 0),
        mapping.get(financial_experience, 0),
        mapping.get(annual_income, 0),
        mapping.get(risk_tolerance, 0),
    ]
).reshape(1, -1)

print("\nEncoded features:", features)

# make a prediction using your already-defined model
prediction = model.predict(features)[0]
print("Model prediction:", prediction)

# map predicted class → stock suggestions
stock_map = {
    0: ["AAPL", "MSFT", "GOOGL", "AMZN", "NVDA"],
    1: ["TCS.NS", "INFY.NS", "HDFCBANK.NS", "RELIANCE.NS", "ICICIBANK.NS"],
    2: ["TSLA", "META", "NFLX", "ADBE", "AMD"],
    3: ["BA", "NKE", "KO", "PEP", "MCD"],
    4: ["JPM", "GS", "BAC", "MS", "V"],
    5: ["UNH", "PFE", "JNJ", "MRK", "LLY"],
    6: ["XOM", "CVX", "BP", "SHEL", "TOT"],
    7: ["PG", "COST", "WMT", "TGT", "HD"],
}

try:
    pred_class = int(prediction)
except:
    pred_class = 0

suggested = stock_map.get(pred_class, stock_map[0])

print("\n=== Suggested Stocks ===")
for s in suggested:
    print("-", s)


=== Investor Profile Quiz ===

Encoded features: [[3 2 0 1 1]]
Model prediction: 2

=== Suggested Stocks ===
- TSLA
- META
- NFLX
- ADBE
- AMD




In [11]:
import numpy as np

# assume your trained model variable is called `model`
# (no need to reload or retrain)

print("\n=== Investor Profile Quiz ===")
age_group = input("Age Group (18-25 / 26-40 / 41-60 / 60+): ").strip()
investment_horizon = input(
    "Investment Horizon (Short-term / Medium-term / Long-term): "
).strip()
financial_experience = input(
    "Financial Experience (Beginner / Intermediate / Expert): "
).strip()
annual_income = input("Annual Income (Below 5L / 5L-10L / 10L-25L / 25L+): ").strip()
risk_tolerance = input("Risk Tolerance (Low / Medium / High): ").strip()

# same encoding logic used during training
mapping = {
    "Low": 0,
    "Medium": 1,
    "High": 2,
    "Beginner": 0,
    "Intermediate": 1,
    "Expert": 2,
    "Short-term": 0,
    "Medium-term": 1,
    "Long-term": 2,
    "18-25": 0,
    "26-40": 1,
    "41-60": 2,
    "60+": 3,
    "Below 5L": 0,
    "5L-10L": 1,
    "10L-25L": 2,
    "25L+": 3,
}

# build feature vector
features = np.array(
    [
        mapping.get(age_group, 0),
        mapping.get(investment_horizon, 0),
        mapping.get(financial_experience, 0),
        mapping.get(annual_income, 0),
        mapping.get(risk_tolerance, 0),
    ]
).reshape(1, -1)

print("\nEncoded features:", features)

# make prediction
prediction = model.predict(features)[0]
print("Model prediction:", prediction)

# 🟣 UPDATED STOCK MAP (Option A)
stock_map = {
    0: ["AAPL", "MSFT", "GOOGL", "AMZN", "NVDA"],  # Tech
    1: [
        "TCS.NS",
        "INFY.NS",
        "HDFCBANK.NS",
        "RELIANCE.NS",
        "ICICIBANK.NS",
    ],  # India blue chips
    2: ["TSLA", "META", "NFLX", "ADBE", "AMD"],  # Growth
    3: ["BA", "NKE", "KO", "PEP", "MCD"],  # Consumer
    4: ["JPM", "GS", "BAC", "MS", "V"],  # Finance
    5: ["UNH", "PFE", "JNJ", "MRK", "LLY"],  # Healthcare
    6: ["XOM", "CVX", "BP", "SHEL", "TOT"],  # Energy
    7: ["PG", "COST", "WMT", "TGT", "HD"],  # Retail
    8: ["INTC", "IBM", "ORCL", "SAP", "CSCO"],  # Enterprise Tech
    9: ["AMAT", "ASML", "LRCX", "TXN", "QCOM"],  # Semiconductor/Hardware
}

# convert prediction safely
try:
    pred_class = int(prediction)
except:
    pred_class = 0

# get corresponding stocks
suggested = stock_map.get(pred_class, stock_map[0])

print("\n=== Suggested Stocks ===")
for s in suggested:
    print("-", s)


=== Investor Profile Quiz ===

Encoded features: [[0 0 2 1 1]]
Model prediction: 4

=== Suggested Stocks ===
- JPM
- GS
- BAC
- MS
- V




In [None]:
# import pickle

# with open("model.pkl", "wb") as f:
#     pickle.dump(model, f)

# with open("encoders.pkl", "wb") as f:
#     pickle.dump(encoders, f)

In [13]:
import joblib

# Save the trained model
joblib.dump(model, "model.pkl")

print("✅ Model saved as model.pkl")

✅ Model saved as model.pkl
