In [None]:
!pip install pygal
!pip install seaborn
!pip install squarify

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go


import pygal as py
import squarify as sq
import matplotlib
# Notebook-wide matplotlib defaults: 6x4 figures, size-8 tick labels,
# bold size-8 sans-serif text.
plt.rcParams["figure.figsize"] = (6, 4)
for tick_axis in ('xtick', 'ytick'):
    matplotlib.rc(tick_axis, labelsize=8)

font = dict(family='sans-serif', weight='bold', size=8)
matplotlib.rc('font', **font)
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
%matplotlib inline

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the dataset stored
# under it can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the CSV with the Python parser engine, decoding it as ISO-8859-1.
# NOTE(review): no `header=` argument is given, so pandas uses the first line
# of the file as column labels. The following cells copy those labels back in
# as a data row, which suggests the file has no header line — confirm.
df=pd.read_csv("/content/drive/MyDrive/Dataset/all-data.csv",engine="python",encoding="ISO-8859-1")
df

In [None]:
# Remember the two parsed column labels; the next cells put them back as a row.
col1, col2 = df.columns[0], df.columns[1]
col2

In [None]:
# One-row frame whose values are the current column labels themselves.
df2 = pd.DataFrame(data=[[col1, col2]], index=[4845], columns=[col1, col2])

In [None]:
# Append the recovered row, renumber the index, then name the two columns.
# Re-running this cell appends the row again, so run it once per kernel.
df = (
    pd.concat([df, df2], ignore_index=True)
    .set_axis(['sentiment', 'news'], axis=1)
)

In [None]:
# Display the frame after the extra row was appended and the columns renamed.
df

In [None]:
# Horizontal bar chart of the number of rows per sentiment label.
sns.countplot(y="sentiment",data=df)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
from textblob import TextBlob

In [None]:
def preprocess(ReviewText):
    """Strip leftover HTML fragments from a Series of text.

    Removes ``<br/>`` tags, whole ``<a ...>...</a>`` anchors and the entity
    prefixes ``&amp``, ``&gt`` and ``&lt`` (a trailing ``;`` is left in
    place), and turns non-breaking spaces into ordinary spaces.

    Parameters
    ----------
    ReviewText : pandas.Series
        Series of strings to clean.

    Returns
    -------
    pandas.Series
        A new Series with the substitutions applied; the input is not modified.
    """
    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        ("(<br/>)", ""),
        ('(<a).*(>).*(</a>)', ''),
        ('(&amp)', ''),
        ('(&gt)', ''),
        ('(&lt)', ''),
        ('(\xa0)', ' '),
    )
    for pattern, replacement in substitutions:
        # The patterns are regular expressions. pandas >= 2.0 treats the
        # pattern as a literal string unless regex=True is passed, which
        # would make every one of these substitutions a silent no-op.
        ReviewText = ReviewText.str.replace(pattern, replacement, regex=True)
    return ReviewText
# Derived columns. 'Review Text' is the cleaned copy of the news text; the
# polarity, character-length and word-count columns are computed from the raw
# 'news' column.
news_text = df['news']
df['Review Text'] = preprocess(news_text)

df['polarity'] = news_text.map(lambda text: TextBlob(text).sentiment.polarity)
df['news_len'] = news_text.astype(str).map(len)
df['word_count'] = news_text.map(lambda x: len(str(x).split()))

In [None]:
# Display the frame with the cleaned-text, polarity and length columns added.
df

In [None]:
print('top 4 random reviews with the highest positive sentiment polarity: \n')

# De-duplicate on the cleaned text so the same review is not printed twice.
df1=df.drop_duplicates(subset=['Review Text'])

# Rows whose polarity equals 1. The sample size is capped at the number of
# matching rows because DataFrame.sample raises ValueError when asked for
# more rows than exist.
top_positive = df1.loc[df1.polarity == 1, ['Review Text']]
n_show = min(4, len(top_positive))
cl = top_positive.sample(n_show).values if n_show else top_positive.values
for c in cl:
    print(c[0])


In [None]:
print('5 random reviews with the most neutral sentiment(zero) polarity: \n')
# Rows whose polarity is exactly 0. The sample size is capped at the number of
# matching rows because DataFrame.sample raises ValueError when asked for
# more rows than exist.
neutral_rows = df.loc[df.polarity == 0, ['Review Text']]
n_show = min(5, len(neutral_rows))
cl1 = neutral_rows.sample(n_show).values if n_show else neutral_rows.values
for c in cl1:
    print(c[0])

In [None]:
print('5 reviews with the most negative polarity having polarity lesser than -0.80: \n')
# Rows with polarity <= -0.80. Few rows may be this negative, so the sample
# size is capped at the number of matches; DataFrame.sample raises ValueError
# when asked for more rows than exist.
very_negative = df.loc[df.polarity <= -0.80, ['Review Text']]
n_show = min(5, len(very_negative))
cl3 = very_negative.sample(n_show).values if n_show else very_negative.values
for c in cl3:
    print(c[0])

In [None]:
# Box plot of the polarity scores over all rows.
# NOTE(review): `palette` is passed without a `hue`; recent seaborn releases
# appear to deprecate that combination — confirm against the installed version.
sns.boxplot(x="polarity", palette="rainbow", data=df)

In [None]:
# Histogram of the polarity scores over all rows, 50 bins.
df['polarity'].plot.hist(
    bins=50, color="peru", title='Sentiment Polarity Distribution')
plt.show()

In [None]:
# Rows per polarity sign (positive / zero / negative); only rows with a
# non-null 'sentiment' value are counted.
p_s = df.loc[df["polarity"] > 0, "sentiment"].count()
neu_s = df.loc[df["polarity"] == 0, "sentiment"].count()
neg_s = df.loc[df["polarity"] < 0, "sentiment"].count()

In [None]:
# Donut chart of how many rows have positive, zero and negative polarity.

# Slice labels, in the same order as `values` below.
sentiment = ['positive_sentiment',"neutral_sentiment","negative_sentiment"]

# Slice sizes: the positive / zero / negative polarity row counts
# (p_s, neu_s, neg_s are computed in an earlier cell).
values = [p_s,neu_s,neg_s]

# Slice colours, in the same order as the labels.
colors = ['#FF0000', 'olive', '#FFFF00']
# Offset applied to every slice so the wedges are drawn slightly apart.
explode = (0.05, 0.05, 0.05)

# Draw the pie with a one-decimal percentage label on each slice.
plt.pie(values, colors=colors, labels=sentiment,
        autopct='%1.1f%%', pctdistance=0.85,
        explode=explode)

# White circle at the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()

# Add the circle on top of the pie in the current axes.
fig.gca().add_artist(centre_circle)

# Chart title.
plt.title('count of polarity as per sentiment')

# Display the chart.
plt.show()

In [None]:
# Box plot of the per-row word counts.
df.plot.box(y=["word_count"],color="hotpink")

In [None]:
# Histogram of the per-row word counts, 100 bins.
df['word_count'].plot.hist(
    bins=100, color="orange", title='Review Text Word Count Distribution')
plt.show()

In [None]:
# Histogram of text length in characters, 50 bins (`news_len` is len() of each
# news string). The title previously repeated the word-count chart's title.
df['news_len'].plot(
    kind='hist',
    bins=50,
    color="lightblue",
    title='Review Text Length Distribution');plt.show()

In [None]:
# Scatter of character length against word count, coloured by sentiment label,
# with a box plot on the x margin and a violin plot on the y margin.
fig = px.scatter(df, x="news_len", y="word_count", color="sentiment",
                 marginal_x="box", marginal_y="violin",
                  title="Click on the legend items!")
fig.show()

In [None]:
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent tokens in ``corpus``.

    Tokens are produced by a default ``CountVectorizer``. The result is a list
    of ``(token, count)`` pairs sorted by descending count; ``n=None`` returns
    every token.
    """
    cv = CountVectorizer()
    cv.fit(corpus)
    # 1 x vocabulary matrix holding the total count of every token.
    totals = cv.transform(corpus).sum(axis=0)
    ranked = sorted(
        [(token, totals[0, col]) for token, col in cv.vocabulary_.items()],
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]
# Print the 20 most frequent tokens of the cleaned text, chart their counts,
# then show the table. Note that this rebinds `df1`.
common_words = get_top_n_words(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df1 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
(
    df1.groupby('ReviewText')['count']
    .sum()
    .sort_values(ascending=False)
    .plot(kind='bar', title='Top 20 words in review before removing stop words')
)
df1

In [None]:
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent non-stop-word tokens in ``corpus``.

    Tokens are produced by a ``CountVectorizer`` with English stop words
    removed. The result is a list of ``(token, count)`` pairs sorted by
    descending count; ``n=None`` returns every token.
    """
    cv = CountVectorizer(stop_words = 'english')
    cv.fit(corpus)
    # 1 x vocabulary matrix holding the total count of every token.
    totals = cv.transform(corpus).sum(axis=0)
    ranked = sorted(
        [(token, totals[0, col]) for token, col in cv.vocabulary_.items()],
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]
# Print the 20 most frequent tokens of the cleaned text and chart their counts.
# Note that this rebinds `df2`.
common_words = get_top_n_words(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df2 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
(
    df2.groupby('ReviewText')['count']
    .sum()
    .sort_values(ascending=False)
    .plot(kind='bar', title='Top 20 words in review after removing stop words')
)

In [None]:
def get_top_n_bigram(corpus, n=None):
    """Return the ``n`` most frequent bigrams in ``corpus``.

    Bigrams are produced by a ``CountVectorizer`` with ``ngram_range=(2, 2)``.
    The result is a list of ``(bigram, count)`` pairs sorted by descending
    count; ``n=None`` returns every bigram.
    """
    cv = CountVectorizer(ngram_range=(2, 2))
    cv.fit(corpus)
    # 1 x vocabulary matrix holding the total count of every bigram.
    totals = cv.transform(corpus).sum(axis=0)
    ranked = sorted(
        [(gram, totals[0, col]) for gram, col in cv.vocabulary_.items()],
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]
# Print the 20 most frequent bigrams of the cleaned text and chart their counts.
common_words = get_top_n_bigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df3 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
(
    df3.groupby('ReviewText')['count']
    .sum()
    .sort_values(ascending=False)
    .plot(kind='bar', title='Top 20 bigrams in review before removing stop words')
)


In [None]:
def get_top_n_bigram(corpus, n=None):
    """Return the ``n`` most frequent bigrams in ``corpus``, stop words removed.

    Bigrams are produced by a ``CountVectorizer`` with ``ngram_range=(2, 2)``
    and English stop words removed. The result is a list of
    ``(bigram, count)`` pairs sorted by descending count; ``n=None`` returns
    every bigram.
    """
    cv = CountVectorizer(ngram_range=(2, 2), stop_words='english')
    cv.fit(corpus)
    # 1 x vocabulary matrix holding the total count of every bigram.
    totals = cv.transform(corpus).sum(axis=0)
    ranked = sorted(
        [(gram, totals[0, col]) for gram, col in cv.vocabulary_.items()],
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]
# Print the 20 most frequent bigrams of the cleaned text and chart their counts.
common_words = get_top_n_bigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df4 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
(
    df4.groupby('ReviewText')['count']
    .sum()
    .sort_values(ascending=False)
    .plot(kind='bar', title='Top 20 bigrams in review after removing stop words')
)


In [None]:
def get_top_n_trigram(corpus, n=None):
    """Return the ``n`` most frequent trigrams in ``corpus``.

    Trigrams are produced by a ``CountVectorizer`` with ``ngram_range=(3, 3)``.
    The result is a list of ``(trigram, count)`` pairs sorted by descending
    count; ``n=None`` returns every trigram.
    """
    cv = CountVectorizer(ngram_range=(3, 3))
    cv.fit(corpus)
    # 1 x vocabulary matrix holding the total count of every trigram.
    totals = cv.transform(corpus).sum(axis=0)
    ranked = sorted(
        [(gram, totals[0, col]) for gram, col in cv.vocabulary_.items()],
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]
# Print the 20 most frequent trigrams of the cleaned text and chart their counts.
common_words = get_top_n_trigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df5 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
(
    df5.groupby('ReviewText')['count']
    .sum()
    .sort_values(ascending=False)
    .plot(kind='bar', title='Top 20 trigrams in review before removing stop words')
)

In [None]:
def get_top_n_trigram(corpus, n=None):
    """Return the ``n`` most frequent trigrams in ``corpus``, stop words removed.

    Trigrams are produced by a ``CountVectorizer`` with ``ngram_range=(3, 3)``
    and English stop words removed. The result is a list of
    ``(trigram, count)`` pairs sorted by descending count; ``n=None`` returns
    every trigram.
    """
    cv = CountVectorizer(ngram_range=(3, 3), stop_words='english')
    cv.fit(corpus)
    # 1 x vocabulary matrix holding the total count of every trigram.
    totals = cv.transform(corpus).sum(axis=0)
    ranked = sorted(
        [(gram, totals[0, col]) for gram, col in cv.vocabulary_.items()],
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]
# Print the 20 most frequent trigrams of the cleaned text and chart their counts.
common_words = get_top_n_trigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df6 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
(
    df6.groupby('ReviewText')['count']
    .sum()
    .sort_values(ascending=False)
    .plot(kind='bar', title='Top 20 trigrams in review after removing stop words')
)


In [None]:
# Polarity scores split by the labelled sentiment class.
y0 = df.loc[df['sentiment'] == 'positive', 'polarity']
y1 = df.loc[df['sentiment'] == 'negative', 'polarity']
y2 = df.loc[df['sentiment'] == 'neutral', 'polarity']

# One box trace per class, each with its own marker colour.
trace0 = go.Box(y=y0, name='positive', marker={'color': 'rgb(214, 12, 140)'})
trace1 = go.Box(y=y1, name='negative', marker={'color': 'rgb(0, 128, 128)'})
trace2 = go.Box(y=y2, name='neutral', marker={'color': 'rgb(10, 140, 208)'})

data = [trace0, trace1, trace2]
layout = go.Layout(title="Polarity Boxplot according to sentiment")

go.Figure(data=data, layout=layout)

In [None]:
# Text length in characters split by the labelled sentiment class.
y0 = df.loc[df['sentiment'] == 'positive', 'news_len']
y1 = df.loc[df['sentiment'] == 'negative', 'news_len']
y2 = df.loc[df['sentiment'] == 'neutral', 'news_len']

# One box trace per class, each with its own marker colour.
trace0 = go.Box(y=y0, name='positive', marker={'color': 'rgb(214, 12, 140)'})
trace1 = go.Box(y=y1, name='negative', marker={'color': 'rgb(0, 128, 128)'})
trace2 = go.Box(y=y2, name='neutral', marker={'color': 'rgb(10, 140, 208)'})

data = [trace0, trace1, trace2]
layout = go.Layout(title="news length Boxplot by sentiment")
go.Figure(data=data, layout=layout)

In [None]:
# Polarity scores for each labelled sentiment class.
xp = df.loc[df['sentiment'].eq("positive"), 'polarity']
xneu = df.loc[df['sentiment'].eq("neutral"), 'polarity']
xneg = df.loc[df['sentiment'].eq("negative"), 'polarity']

# Semi-transparent histogram per class so the overlaid bars stay visible.
trace1 = go.Histogram(x=xp, name='positive', opacity=0.75)
trace2 = go.Histogram(x=xneu, name='neutral', opacity=0.75)
trace3 = go.Histogram(x=xneg, name='negative', opacity=0.75)

data = [trace1, trace2, trace3]
layout = go.Layout(title='Distribution of Sentiment polarity', barmode='overlay')
go.Figure(data=data, layout=layout)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, mean_squared_error, confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from textblob import TextBlob

# Load dataset from the absolute path.
# The file has no header line (the exploration cells earlier in this notebook
# have to re-append the parsed "header" as a data row), so the column names are
# supplied here. Without header=None pandas would consume the first record as
# column labels and that sample would be lost.
file_path = '/content/drive/MyDrive/Dataset/all-data.csv'
data = pd.read_csv(file_path, encoding='ISO-8859-1',
                   header=None, names=['sentiment', 'news'])

# Drop rows with missing values
data = data.dropna()

# Feature Engineering: Calculate Polarity, News Length, and Word Count
data['polarity'] = data['news'].apply(lambda x: TextBlob(x).sentiment.polarity)
data['news_len'] = data['news'].apply(len)
data['word_count'] = data['news'].apply(lambda x: len(x.split()))

# Encode the sentiment labels as integer class codes
le = LabelEncoder()
data['sentiment_encoded'] = le.fit_transform(data['sentiment'])

# Choose the train/test rows BEFORE fitting the vectorizer, so the TF-IDF
# vocabulary and IDF weights are learned from training text only (fitting on
# every row leaks information from the test set into the features).
row_positions = np.arange(len(data))
train_idx, test_idx = train_test_split(
    row_positions, test_size=0.2, random_state=42)

# Vectorize the news text using TF-IDF fitted on the training rows
vectorizer = TfidfVectorizer(max_features=1000)
vectorizer.fit(data['news'].iloc[train_idx])
X_text = vectorizer.transform(data['news']).toarray()

# Combine text features (TF-IDF) with numerical features (polarity, news_len, word_count)
X = np.hstack((X_text, data[['polarity', 'news_len', 'word_count']].values))
y = data['sentiment_encoded']

# Materialise the split chosen above
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]


# --- REGRESSION MODEL ---
# Train a regression model. It treats the integer class codes as a numeric
# target, so the MSE below is measured in class-code units.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predictions and evaluation for regression
y_pred_reg = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred_reg)
print(f"Mean Squared Error (Regression): {mse}")

# --- DECISION TREE MODEL ---
# Train Decision Tree classifier
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)

# Predictions and evaluation for Decision Tree
y_pred_tree = tree.predict(X_test)
accuracy_tree = accuracy_score(y_test, y_pred_tree)
print(f"Accuracy (Decision Tree): {accuracy_tree}")

In [None]:
# --- KNN MODEL ---
# Fit a 5-nearest-neighbour classifier on the training split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Score it on the held-out split.
y_pred_knn = knn.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"Accuracy (KNN): {accuracy_knn}")

# Confusion matrix heatmap, with the original label names on both axes.
cm_knn = confusion_matrix(y_test, y_pred_knn)
plt.figure(figsize=(6, 4))
sns.heatmap(
    cm_knn,
    annot=True, fmt='d', cmap='Blues', cbar=False,
    xticklabels=le.classes_, yticklabels=le.classes_,
)
plt.title('Confusion Matrix (KNN)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Train Decision Tree model
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

# Make predictions
y_pred_tree = decision_tree.predict(X_test)

# Print classification report and confusion matrix
print("Decision Tree Classification Report:\n",
      classification_report(y_test, y_pred_tree))
print("Decision Tree Confusion Matrix:\n",
      confusion_matrix(y_test, y_pred_tree))

# Human-readable feature names for the plot. X_train is built as the TF-IDF
# columns followed by the three engineered columns, so the names are assembled
# in that order. If the count does not match the matrix width, fall back to
# None, in which case plot_tree labels features generically.
if hasattr(X_train, 'columns'):
    feature_names = list(X_train.columns)
else:
    feature_names = (list(vectorizer.get_feature_names_out())
                     + ['polarity', 'news_len', 'word_count'])
    if len(feature_names) != X_train.shape[1]:
        feature_names = None

# Show the original sentiment labels instead of the integer class codes.
class_names = [str(label)
               for label in le.inverse_transform(decision_tree.classes_)]

plt.figure(figsize=(30, 15))

# Plot the decision tree (top 3 levels only, to keep it readable)
plot_tree(
    decision_tree,
    feature_names=feature_names,
    class_names=class_names,
    filled=True,
    rounded=True,
    fontsize=15,
    max_depth=3,
    proportion=True,
    precision=2,
    impurity=True,
    label='all'
)

plt.title('Decision Tree Structure', fontsize=20)
plt.tight_layout()
plt.show()

In [None]:
!pip install gradio

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pickle
import gradio as gr

In [6]:
# Load the dataset. The file has no header line (the exploration cells earlier
# in this notebook re-append the parsed "header" as a data row), so header=None
# keeps the first record as data; the next cell assigns the column names.
data = pd.read_csv('/content/drive/MyDrive/Dataset/all-data.csv', encoding='ISO-8859-1', header=None)

In [None]:

# Give the two columns descriptive names.
data.columns = ['sentiment', 'text']

# Turn the sentiment labels into integer class codes.
label_encoder = LabelEncoder()
data['sentiment_encoded'] = label_encoder.fit_transform(data['sentiment'])

# Text is the independent variable, the encoded sentiment the dependent one.
X, y = data['text'], data['sentiment_encoded']

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF features: vocabulary learned from the training text only, then
# applied unchanged to the test text.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Fit a linear regression on the encoded labels.
regressor = LinearRegression()
regressor.fit(X_train_tfidf, y_train)

In [None]:
# Persist the fitted regressor and TF-IDF vectorizer as pickle files in the
# current working directory.
for pkl_path, artifact in (('your_model.pkl', regressor),
                           ('tfidf_vectorizer.pkl', tfidf)):
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(artifact, pkl_file)

print("Model and vectorizer saved as 'your_model.pkl' and 'tfidf_vectorizer.pkl'")

In [None]:
# Evaluate the model on the held-out split
y_pred = regressor.predict(X_test_tfidf)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Accuracy as the share of correctly classified test rows. The regressor
# outputs a continuous score, so each score is first mapped to the nearest
# integer class code using cut points halfway between consecutive codes.
# (The previous formula divided the RMSE by an assumed 0-100 target range;
# the labels are encoded as small integers, so it reported ~99% regardless of
# model quality.)
class_codes = np.arange(len(label_encoder.classes_))
cut_points = class_codes[:-1] + 0.5
y_pred_class = np.digitize(y_pred, cut_points)
accuracy = np.mean(y_pred_class == np.asarray(y_test)) * 100

#print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
#print(f"R² Score: {r2}")
print(f"Final Accuracy: {accuracy:.2f}%")

In [10]:
# Function to predict market movement based on news title
def predict_market_movement(news_title):
    """Classify a news title as positive, neutral or negative market movement.

    Loads the pickled regressor and TF-IDF vectorizer from the current working
    directory, scores the title, and maps the continuous score to a label.

    Parameters
    ----------
    news_title : str
        Headline text to score.

    Returns
    -------
    str
        One of "Positive Market Movement", "Neutral Market Movement" or
        "Negative Market Movement", or a prompt message when the title is
        empty or contains only whitespace.

    Raises
    ------
    FileNotFoundError
        If 'your_model.pkl' or 'tfidf_vectorizer.pkl' does not exist.
    """
    # Reject missing, empty and whitespace-only input before touching the model.
    if not news_title or not news_title.strip():
        return "Please enter a news title."

    # Load the trained model and TF-IDF vectorizer.
    # NOTE: pickle.load can execute arbitrary code; only load files written by
    # this notebook, never pickles from an untrusted source.
    with open('your_model.pkl', 'rb') as model_file:
        regressor = pickle.load(model_file)

    with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
        tfidf = pickle.load(vectorizer_file)

    # Vectorize the input news title and predict
    news_title_tfidf = tfidf.transform([news_title])
    predicted_sentiment = regressor.predict(news_title_tfidf)[0]

    # Map the score to a label. The first bound is inclusive so that a score of
    # exactly 1.5 is not dropped into the "negative" fallback branch.
    if predicted_sentiment >= 1.5:
        sentiment_label = "Positive Market Movement"
    elif predicted_sentiment > 0.5:
        sentiment_label = "Neutral Market Movement"
    else:
        sentiment_label = "Negative Market Movement"

    return sentiment_label

In [None]:
# Gradio UI: a two-line text box for the headline and a text box for the
# predicted label.
news_input = gr.Textbox(placeholder="Enter News Title", lines=2)
output_text = gr.Textbox()

interface = gr.Interface(
    title="Market Movement Predictor",
    fn=predict_market_movement,
    inputs=news_input,
    outputs=output_text,
)

# Start serving the interface.
interface.launch()