In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import datetime
import re


In [85]:
rolling_window = 5
commentDF = pd.read_csv('/content/comment_analysis_Indian_Stock_Market_NSE.csv')
commentDF['Rolling_Vader_Pos'] = commentDF['Vader Pos'].rolling(window=rolling_window).mean()
commentDF['Rolling_Vader_Neg'] = commentDF['Vader Neg'].rolling(window=rolling_window).mean()
commentDF['Rolling_TextBlob_Pos'] = commentDF['textblob Positive'].rolling(window=rolling_window).mean()
commentDF['Rolling_TextBlob_Neg'] = commentDF['textblob Negative'].rolling(window=rolling_window).mean()

# Fill NA values resulting from rolling calculation
commentDF.fillna(0, inplace=True)

In [86]:
commentDF['Weighted_Vader_Pos'] = commentDF['Vader Pos'] * commentDF['NumberOfTickerMentions']
commentDF['Weighted_Vader_Neg'] = commentDF['Vader Neg'] * commentDF['NumberOfTickerMentions']
commentDF['Weighted_TextBlob_Pos'] = commentDF['textblob Positive'] * commentDF['NumberOfTickerMentions']
commentDF['Weighted_TextBlob_Neg'] = commentDF['textblob Negative'] * commentDF['NumberOfTickerMentions']

# Optional: Normalize by dividing by the total mentions in the same window
commentDF['Normalized_Weighted_Vader_Pos'] = commentDF['Weighted_Vader_Pos'] / (commentDF['NumberOfTickerMentions'] + 1e-5)
commentDF['Normalized_Weighted_Vader_Neg'] = commentDF['Weighted_Vader_Neg'] / (commentDF['NumberOfTickerMentions'] + 1e-5)

In [87]:
commentDF['Combined_Sentiment_Pos'] = commentDF['Rolling_Vader_Pos'] + commentDF['Normalized_Weighted_Vader_Pos']
commentDF['Combined_Sentiment_Neg'] = commentDF['Rolling_Vader_Neg'] + commentDF['Normalized_Weighted_Vader_Neg']


In [88]:
commentDF.to_csv('comment_analysis_with_sentiments.csv', index=False)

In [288]:
comment_df = pd.read_csv('general.csv')
stock_df = pd.read_csv('/content/stockhistory_^NSEI.csv')

In [289]:
print("Comment Data Columns:", comment_df.columns)
print("Stock Data Columns:", stock_df.columns)
comment_df['Date'] = pd.to_datetime(comment_df['Date'], errors='coerce')
stock_df['Date'] = pd.to_datetime(stock_df['Date'], errors='coerce')
print("Missing values in comment_df 'Date':", comment_df['Date'].isnull().sum())
print("Missing values in stock_df 'Date':", stock_df['Date'].isnull().sum())
stock_df = stock_df.dropna(subset=['Date'])

Comment Data Columns: Index(['Title', 'Ticker', 'Date', 'NumberOfTickerMentions', 'Vader Neg',
       'Vader Pos', 'Vader Neut', 'textblob Negative', 'textblob Positive',
       'textblob Neut', 'Rolling_Vader_Pos', 'Rolling_Vader_Neg',
       'Rolling_TextBlob_Pos', 'Rolling_TextBlob_Neg', 'Weighted_Vader_Pos',
       'Weighted_Vader_Neg', 'Weighted_TextBlob_Pos', 'Weighted_TextBlob_Neg',
       'Normalized_Weighted_Vader_Pos', 'Normalized_Weighted_Vader_Neg',
       'Combined_Sentiment_Pos', 'Combined_Sentiment_Neg'],
      dtype='object')
Stock Data Columns: Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits'],
      dtype='object')
Missing values in comment_df 'Date': 0
Missing values in stock_df 'Date': 0


In [290]:
invalid_dates = comment_df[~comment_df['Date'].apply(pd.to_datetime, errors='coerce').notna()]
print(invalid_dates)

Empty DataFrame
Columns: [Title, Ticker, Date, NumberOfTickerMentions, Vader Neg, Vader Pos, Vader Neut, textblob Negative, textblob Positive, textblob Neut, Rolling_Vader_Pos, Rolling_Vader_Neg, Rolling_TextBlob_Pos, Rolling_TextBlob_Neg, Weighted_Vader_Pos, Weighted_Vader_Neg, Weighted_TextBlob_Pos, Weighted_TextBlob_Neg, Normalized_Weighted_Vader_Pos, Normalized_Weighted_Vader_Neg, Combined_Sentiment_Pos, Combined_Sentiment_Neg]
Index: []

[0 rows x 22 columns]


In [274]:


# Define function to check if the comment contains the stock ticker
def contains_ticker(comment, ticker_keywords):
    # Regex to match ticker symbol in the comment based on ticker_keywords
    pattern = rf"\b(?:{'|'.join(ticker_keywords)})\b"
    return bool(re.search(pattern, comment, re.IGNORECASE))

# Filter comments by the ticker keywords
def filter_comments_by_ticker(comment_df, ticker_keywords):
    # Use boolean indexing to filter the DataFrame directly for keywords
    relevant_comments = comment_df[comment_df['Title'].str.contains(rf"\b(?:{'|'.join(ticker_keywords)})\b", case=False, na=False)]
    return relevant_comments

# Merge the comment dataframe with stock data based on Date
def merge_sentiment_and_stock_data(comment_df, stock_df):
    # Ensure 'Date' columns in both dataframes are datetime objects
    # Convert Date columns to datetime in both dataframes
    comment_df['Date'] = pd.to_datetime(comment_df['Date'], errors='coerce').dt.tz_localize(None)
    stock_df = stock_df.reset_index()
    stock_df['Date'] = pd.to_datetime(stock_df['Date'], errors='coerce').dt.tz_localize(None)
    # Merge filtered comments with stock price data (using 'Date')
    final_df = pd.merge(comment_df, stock_df, on='Date', how='inner')
    return final_df

# Function to process and merge the data
def process_data():
    # Define the stock ticker and keywords you're interested in
    ticker = 'ZOMATO.NS'  # Replace with the correct ticker format
    ticker_keywords = ['ZOMATO']  # List of keywords to match in comments

    # Load your comment data
    comment_df = pd.read_csv('comment_analysis_with_sentiments.csv')

    # Load stock price data (e.g., from Yahoo Finance for ticker 'ADANIENT.NSE')
    # Get stock data for the last 6 months

    if stock_df.empty:
        print(f"No stock data available for {ticker}. Please check the ticker symbol or try another period.")
        return

    # Filter comments by the target ticker keywords
    relevant_comment_df = filter_comments_by_ticker(comment_df, ticker_keywords)

    # Check if relevant_comment_df is empty
    if relevant_comment_df.empty:
        print(f"No comments found for ticker: {ticker}")
        return

    # Merge the filtered comment data with stock price data based on the 'Date' column
    final_df = merge_sentiment_and_stock_data(relevant_comment_df, stock_df)

    # Save the merged dataframe to a CSV file
    final_df.to_csv(f'merged_data_{ticker}.csv', index=False)

    # Display the merged dataframe
    print(final_df)

# Run the process_data function
process_data()


                                               Title Ticker       Date  \
0                          Zomato doubled my money 🥹    NSE 2023-11-08   
1             Zomato- Blinkit - Things don't add up.    NSE 2024-09-27   
2                             Zomato Investors today    NSE 2022-08-02   
3  Why market fell today? Why is Zomato falling? ...    NSE 2024-05-27   
4  Zomato, swiggy hike platform fee by 20% to Rs ...    NSE 2024-07-15   
5       What do you think will happen to Zomato now?    NSE 2022-08-22   
6  Sell shovels during a Gold rush. Pudumjee Pape...    NSE 2024-12-02   

   NumberOfTickerMentions  Vader Neg  Vader Pos  Vader Neut  \
0                       1         12         39          47   
1                      32          9         21          33   
2                      47          8          8          22   
3                      68          5         15          34   
4                      74         16         38          54   
5                      81    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comment_df['Date'] = pd.to_datetime(comment_df['Date'], errors='coerce').dt.tz_localize(None)


In [296]:
import pandas as pd
import glob


def merge_all_csvs(output_file, sort_by="Date"):
    """
    Merges all CSV files matching 'merged_data_*.csv' into a single file and sorts by date.

    Args:
        output_file (str): Path to save the final merged and sorted CSV file.
        sort_by (str): Column to sort by. Default is 'Date'.
    """
    try:
        # Get all merged_data_*.csv files
        csv_files = glob.glob("/content/merged_data_*.csv")

        if not csv_files:
            print("No CSV files found matching 'merged_data_*.csv'")
            return

        # Read and combine all CSV files
        combined_df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

        # Ensure the sort column is in datetime format if it's a date
        if sort_by in combined_df.columns:
            combined_df[sort_by] = pd.to_datetime(combined_df[sort_by], errors='coerce')

        # Sort the combined DataFrame by the Date column
        sorted_df = combined_df.sort_values(by=sort_by)
        sorted_df.drop_duplicates(inplace=True)

        # Save the merged and sorted DataFrame to a new CSV
        sorted_df.to_csv(output_file, index=False)
        print(f"All CSVs have been merged and saved to: {output_file}")
        print(sorted_df.head())  # Print first few rows for verification

    except Exception as e:
        print(f"An error occurred: {e}")


# Example usage
merge_all_csvs("merged_data_all.csv", sort_by="Date")


All CSVs have been merged and saved to: merged_data_all.csv
                                   Title Ticker       Date  \
475                Meme of the day 12/02    NSE 2021-02-12   
300                              Stonks!    NSE 2021-04-09   
49   Daily Story but today was different    NSE 2021-04-12   
185       Shared by a friend. Relate max    NSE 2021-05-10   
357                   Bitcoiners to Elon    NSE 2021-05-17   

    NumberOfTickerMentions Vader Neg Vader Pos Vader Neut textblob Negative  \
475                     68         0        15         15                 1   
300                     59         0         0          5                 0   
49                       5         5         3         14                 7   
185                     36         3         4         11                 3   
357                     60         1         3          8                 3   

    textblob Positive textblob Neut  ...  Combined_Sentiment_Pos  \
475                11   

  combined_df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)


In [None]:
# Load the merged data
merged_data = pd.read_csv('merged_data_all.csv')

# Load the original comment data
comment_data = pd.read_csv('comment_analysis_with_sentiments.csv')

# Find comments not present in the merged data
# Assuming 'Title' is the unique identifier in both DataFrames
comments_not_in_merged = comment_data[~comment_data['Title'].isin(merged_data['Title'])]

# Save the comments not in merged data to a new CSV file
comments_not_in_merged.to_csv('general.csv', index=False)

In [2]:
# For the General Comments
import pandas as pd
import yfinance as yf
from datetime import datetime

# Merge the comment dataframe with stock data based on Date
def merge_sentiment_and_stock_data(comment_df, stock_df):
    # Ensure 'Date' columns in both dataframes are datetime objects
    # Convert Date columns to datetime in both dataframes
    comment_df['Date'] = pd.to_datetime(comment_df['Date'], errors='coerce').dt.tz_localize(None)
    stock_df = stock_df.reset_index()
    stock_df['Date'] = pd.to_datetime(stock_df['Date'], errors='coerce').dt.tz_localize(None)
    # Merge filtered comments with stock price data (using 'Date')
    final_df = pd.merge(comment_df, stock_df, on='Date', how='inner')
    return final_df

# Function to process and merge the data
def process_data():
    # Define the stock ticker for NIFTY 50
    ticker = '^NSEI'  # Ticker for NIFTY 50

    # Load your comment data
    comment_df = pd.read_csv('general.csv')

    # Fetch stock price data from Yahoo Finance
    # Last 1 year of data

    # Check if stock data is empty
    if stock_df.empty:
        print(f"No stock data available for {ticker}. Please check the ticker symbol or try another period.")
        return

    # Merge the comment data with stock price data based on the 'Date' column
    final_df = merge_sentiment_and_stock_data(comment_df, stock_df)

    # Save the merged dataframe to a CSV file
    output_file = f'merged_data_{ticker}.csv'
    final_df.to_csv(output_file, index=False)

    # Display the merged dataframe
    print(f"Merged data saved to {output_file}")
    print(final_df)

# Run the process_data function
process_data()


FileNotFoundError: [Errno 2] No such file or directory: 'general.csv'