In [3]:
import pandas as pd 

In [9]:
hist=pd.read_csv('C:\\Users\\hp\\Desktop\\SEM 5\\INTERNSHIP\\historical_data.csv')

In [11]:
hist["date"] = pd.to_datetime(hist["Timestamp IST"].str[:10], format="%d-%m-%Y")
hist["Closed PnL"] = hist["Closed PnL"].astype(float)
hist["Size USD"] = hist["Size USD"].astype(float)

In [15]:
fear = pd.read_csv('C:\\Users\\hp\\Desktop\\SEM 5\\INTERNSHIP\\fear_greed_index.csv')
fear["date"] = pd.to_datetime(fear["date"])

In [17]:
#MERGING THE DATASETS
merged = hist.merge(
    fear[["date", "value", "classification"]],
    on="date",
    how="left"
)

# sanity check: almost all trades get a classification
print(merged["classification"].isna().mean())


2.840586297011703e-05


In [19]:
#DATA AGGREGRATION 
daily = merged.groupby("date").agg(
    total_pnl=("Closed PnL", "sum"),
    avg_pnl=("Closed PnL", "mean"),
    n_trades=("Closed PnL", "size")
).reset_index().merge(
    fear[["date", "value", "classification"]],
    on="date",
    how="left"
)

print(daily["value"].corr(daily["total_pnl"]))
print(daily["value"].corr(daily["avg_pnl"]))


-0.08264200129021722
0.037314753868771165


In [21]:
sent_summary = merged.groupby("classification").agg(
    n_trades      = ("Closed PnL", "size"),
    avg_pnl       = ("Closed PnL", "mean"),
    median_pnl    = ("Closed PnL", "median"),
    win_rate      = ("Closed PnL", lambda x: (x > 0).mean()),
    total_pnl     = ("Closed PnL", "sum"),
    avg_size_usd  = ("Size USD", "mean"),
    avg_fee       = ("Fee", "mean"),
).sort_values("n_trades", ascending=False)

print(sent_summary)

                n_trades    avg_pnl  median_pnl  win_rate     total_pnl  \
classification                                                            
Fear               61837  54.290400         0.0  0.420768  3.357155e+06   
Greed              50303  42.743559         0.0  0.384828  2.150129e+06   
Extreme Greed      39992  67.892861         0.0  0.464943  2.715171e+06   
Neutral            37686  34.307718         0.0  0.396991  1.292921e+06   
Extreme Fear       21400  34.537862         0.0  0.370607  7.391102e+05   

                avg_size_usd   avg_fee  
classification                          
Fear             7816.109931  1.495172  
Greed            5736.884375  1.254372  
Extreme Greed    3112.251565  0.675902  
Neutral          4782.732661  1.044798  
Extreme Fear     5349.731843  1.116291  


In [23]:
side_summary = merged.groupby(["classification", "Side"]).agg(
    n_trades = ("Closed PnL", "size"),
    avg_pnl  = ("Closed PnL", "mean"),
    win_rate = ("Closed PnL", lambda x: (x > 0).mean()),
    avg_size_usd = ("Size USD", "mean")
).reset_index()

print(side_summary)


  classification  Side  n_trades     avg_pnl  win_rate  avg_size_usd
0   Extreme Fear   BUY     10935   34.114627  0.201646   5161.502485
1   Extreme Fear  SELL     10465   34.980106  0.547157   5546.414885
2  Extreme Greed   BUY     17940   10.498927  0.311427   3363.034672
3  Extreme Greed  SELL     22052  114.584643  0.589833   2908.231569
4           Fear   BUY     30270   63.927104  0.263000   8154.666208
5           Fear  SELL     31567   45.049641  0.572053   7491.463987
6          Greed   BUY     24576   25.002302  0.318075   6306.490894
7          Greed  SELL     25727   59.691091  0.448595   5192.761477
8        Neutral   BUY     18969   29.227429  0.240023   3881.410441
9        Neutral  SELL     18717   39.456408  0.556072   5696.190011


In [27]:
#FINDING IF THERE IS DAILY CORRELATION
daily = merged.groupby("date").agg(
    total_pnl=("Closed PnL", "sum"),
    avg_pnl=("Closed PnL", "mean"),
    n_trades=("Closed PnL", "size")
).reset_index().merge(
    fear[["date", "value", "classification"]],
    on="date",
    how="left"
)

print("Corr(value, total_pnl):", daily["value"].corr(daily["total_pnl"]))
print("Corr(value, avg_pnl):", daily["value"].corr(daily["avg_pnl"]))


Corr(value, total_pnl): -0.08264200129021722
Corr(value, avg_pnl): 0.037314753868771165


In [33]:
data = merged.copy()
data["is_win"] = (data["Closed PnL"] > 0).astype(int)

# Drop rows with any NaNs in the feature columns
data_model = data[["Side", "Size USD", "value", "classification", "is_win"]].dropna()

X = data_model[["Side", "Size USD", "value", "classification"]]
y = data_model["is_win"]


In [35]:
#THIS IS A SIMPLE PREDICTING MODEL 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

data = merged.copy()
data["is_win"] = (data["Closed PnL"] > 0).astype(int)

X = data[["Side", "Size USD", "value", "classification"]]
y = data["is_win"]

# Define which columns are numeric and categorical
numeric_features = ["Size USD", "value"]
categorical_features = ["Side", "classification"]

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

clf = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LogisticRegression(max_iter=1000))
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

clf.fit(X_train, y_train)

print("Train accuracy:", clf.score(X_train, y_train))
print("Test accuracy:", clf.score(X_test, y_test))


Train accuracy: 0.6262257440273643
Test accuracy: 0.6267013847792638
