## Project 2 — NLP: fake vs. real news classification

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline


# Text-classification pipeline: TF-IDF features feeding a logistic regression.
# The step names ('tfidf', 'clf') are the keys under which the steps can be
# retrieved from pipeline.named_steps.
tfidf_step = ('tfidf', TfidfVectorizer())
clf_step = ('clf', LogisticRegression())
pipeline = Pipeline(steps=[tfidf_step, clf_step])

In [16]:


# Locate the sample files relative to the current working directory.
DIR = os.getcwd()             # current working directory (str)

FAKE_CSV = os.path.join(DIR, 'samples', 'train_Fake.csv')
TRUE_CSV = os.path.join(DIR, 'samples', 'train_True.csv')
TEST_PATH = os.path.join(DIR, 'samples', 'Test_Real_Fake.csv')

# Read the two training CSV files and the separate evaluation CSV file.
fake_df = pd.read_csv(FAKE_CSV)
true_df = pd.read_csv(TRUE_CSV)
different_db = pd.read_csv(TEST_PATH)

# Binary target: 0 for rows from the fake-news file, 1 for rows from the true-news file.
fake_df["label"] = 0
true_df["label"] = 1

In [17]:

# Prepare the training datasets: drop the metadata columns so only the
# remaining columns (article text and label) are used.
# errors="ignore" keeps this cell re-runnable: true_df / fake_df are
# overwritten here, so on a second run the columns are already gone and a
# plain drop() would raise KeyError.
METADATA_COLUMNS = ["title", "subject", "date"]
true_df = true_df.drop(columns=METADATA_COLUMNS, errors="ignore")
fake_df = fake_df.drop(columns=METADATA_COLUMNS, errors="ignore")

# Split each dataframe in half by row position: first half -> train,
# second half -> test.
# NOTE(review): rows are not shuffled before splitting, so the split follows
# the file order — confirm this is intended.
true_half = len(true_df) // 2
fake_half = len(fake_df) // 2

true_train = true_df.iloc[:true_half].copy()
true_test = true_df.iloc[true_half:].copy()
fake_train = fake_df.iloc[:fake_half].copy()
fake_test = fake_df.iloc[fake_half:].copy()

# Concatenate the true and fake halves to form the train and test sets.
train = pd.concat([true_train, fake_train]).reset_index(drop=True)
test = pd.concat([true_test, fake_test]).reset_index(drop=True)


display(train.head())
display(test.head())


X_train = list(train['text'])
y_train = list(train['label'])
x_test = list(test['text'])
y_test = list(test['label'])


# Prepare the separate evaluation dataset: keep text + label and encode the
# string labels with the same 0/1 convention as the training data.
LABEL_CODES = {'FAKE': 0, 'REAL': 1}
different_db = different_db[["text", "label"]].copy()
different_db['label'] = (
    different_db['label']
      # Map known label strings to their codes and pass every other value
      # through unchanged. Unlike Series.replace on an object column, this
      # does not emit the pandas downcasting FutureWarning, and it is a no-op
      # when the column is already encoded (cell re-run).
      .map(lambda value: LABEL_CODES.get(value, value))
      .astype('Int8')  # nullable integer dtype: missing labels stay <NA>
)
display(different_db.head())

X_different = list(different_db['text'])
y_different = list(different_db['label'])

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,1
1,WASHINGTON (Reuters) - Transgender people will...,1
2,WASHINGTON (Reuters) - The special counsel inv...,1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,1


Unnamed: 0,text,label
0,GENEVA (Reuters) - The top U.N. human rights o...,1
1,WASHINGTON (Reuters) - Republican presidential...,1
2,WASHINGTON (Reuters) - Republican presidential...,1
3,WASHINGTON (Reuters) - U.S. Secretary of State...,1
4,WASHINGTON (Reuters) - The chairman of the Sen...,1


  .replace({'FAKE': 0, 'REAL': 1})


Unnamed: 0,text,label
0,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,U.S. Secretary of State John F. Kerry said Mon...,1
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0
4,It's primary day in New York and front-runners...,1


### Classifying fake news vs. real news

In [18]:
# Fit on the training halves, then score the predictions on the held-out halves.
y_pred = pipeline.fit(X_train, y_train).predict(x_test)
acc_yelp = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {acc_yelp:.4f}")

Accuracy: 0.9531


In [19]:
# 1) Score the model on the separate dataset.
#    The pipeline is not refit here: it was already fit on X_train / y_train
#    when the hold-out accuracy was computed, and refitting on the same data
#    only repeats that work.
y_pred = pipeline.predict(X_different)
acc_yelp = accuracy_score(y_different, y_pred)
print(f"Accuracy: {acc_yelp:.4f}")

# 2) Pull the vectorizer and the classifier out of the pipeline
vectorizer = pipeline.named_steps["tfidf"]
clf = pipeline.named_steps["clf"]

# 3) Get the feature names (the vocabulary words)
feature_names = np.array(vectorizer.get_feature_names_out())

# 4) Treat each word as a one-word "document" and predict class probabilities
probabilities = pipeline.predict_proba(feature_names)

# 5) DataFrame of probabilities: one row per word, one column per class
proba_df = pd.DataFrame(probabilities, index=feature_names, columns=clf.classes_)

# 6) (Optional) give the columns readable names when the classes are {0, 1}
if set(clf.classes_) == {0, 1}:
    proba_df = proba_df.rename(columns={0: "negative", 1: "positive"})

# 7) Sort by the "positive" probability if that column exists, otherwise by
#    the second class (or the only class when there is just one)
if "positive" in proba_df.columns:
    sort_col = "positive"
elif len(clf.classes_) > 1:
    sort_col = clf.classes_[1]
else:
    sort_col = clf.classes_[0]
proba_df_sorted = proba_df.sort_values(by=sort_col, ascending=False)

# 8) Top 10 most "positive" words
print("Top 10 words predicted as POSITIVE (or highest-prob class):")
print(proba_df_sorted.head(10))

# 9) Top 10 most "negative" words
neg_col = "negative" if "negative" in proba_df.columns else clf.classes_[0]
print("Top 10 words predicted as NEGATIVE (or lowest-prob class):")
print(proba_df.sort_values(by=neg_col, ascending=False).head(10))

Accuracy: 0.6439
Top 10 words predicted as POSITIVE (or highest-prob class):
                  negative  positive
said          1.046368e-07  1.000000
reuters       2.002175e-07  1.000000
washington    1.216080e-03  0.998784
on            1.276176e-03  0.998724
republican    2.652257e-02  0.973477
in            3.506812e-02  0.964932
wednesday     4.720373e-02  0.952796
presidential  5.422473e-02  0.945775
tuesday       5.770043e-02  0.942300
thursday      9.729149e-02  0.902709
Top 10 words predicted as NEGATIVE (or lowest-prob class):
          negative  positive
via       0.999728  0.000272
image     0.999466  0.000534
featured  0.999337  0.000663
is        0.998481  0.001519
that      0.998104  0.001896
this      0.997881  0.002119
just      0.996988  0.003012
getty     0.996948  0.003052
images    0.995737  0.004263
com       0.995364  0.004636
