In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [2]:
!pip install tabulate



In [6]:
from tabulate import tabulate
file_path = "//content//human_chat.csv"
try:

    df = pd.read_csv(file_path, sep='\t', header=None, engine='python')
    print("Dataset loaded successfully!")
    if 0 in df.columns:
        df.rename(columns={0: 'response'}, inplace=True)
        # Keep only the relevant 'response' column and drop any empty ones
        df = df[['response']].copy()
    else:
        print("Error: Could not find the text column after loading with tab separator.")
        # Handle case where the separator might be something else if needed

except pd.errors.ParserError as e:
    print(f"Still encountering a ParserError: {e}")
    print("Try a different separator, e.g., 'sep=None' or 'sep=r'\s+' (whitespace)")
    # If '\t' doesn't work, try reading with 'sep=None', which uses an internal mechanism
    df = pd.read_csv(file_path, header=None, engine='python', sep=None)
    df.rename(columns={0: 'response'}, inplace=True)
    df = df[['response']].copy()


print(df.head())


sia = SentimentIntensityAnalyzer()
def analyze_sentiment(text):
    if pd.isna(text):
        return "Neutral"

    # 1. Ensure text is a string and strip whitespace/control characters
    text = str(text).strip()
    try:
        tokens = word_tokenize(text.lower(), language='english')
    except LookupError:
        tokens = text.lower().split()

    # Stop word removal (rest of the function is fine)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]

    # Rejoin cleaned words
    clean_text = " ".join(filtered_tokens)

    # Sentiment analysis
    sia = SentimentIntensityAnalyzer() # Re-initialize if not globally available,
                                       # but assume it is from the global scope
    sentiment_score = sia.polarity_scores(clean_text)
    compound = sentiment_score['compound']

    if compound >= 0.05:
        return "Positive"
    elif compound <= -0.05:
        return "Negative"
    else:
        return "Neutral"

df['Sentiment'] = df['response'].apply(analyze_sentiment)

print("\nSentiment Analysis Results:")

pd.set_option('display.max_colwidth', 40)
result = df[['response', 'Sentiment']].head(20)

table_output = tabulate(
    result,
    headers="keys",
    tablefmt="fancy_grid",
    showindex=False
)
print(table_output)

  print("Try a different separator, e.g., 'sep=None' or 'sep=r'\s+' (whitespace)")


Dataset loaded successfully!
                                            response
0                                       Human 1: Hi!
1            Human 2: What is your favorite holiday?
2  Human 1: one where I get to meet lots of diffe...
3  Human 2: What was the most number of people yo...
4           Human 1: Hard to keep a count. Maybe 25.

Sentiment Analysis Results:
╒═══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════