In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Names for the six CSV columns, passed explicitly to the reader below.
columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']

# Load the raw tweet file, decoding it as latin-1.
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=columns,
    encoding='latin-1',
)

# Quick look at the size, the first rows, and the label balance.
print("Dataset Boyutu:", df.shape)
print("\nİlk 5 satır:", df.head(), sep="\n")
print("\nSentiment Dağılımı:", df['sentiment'].value_counts(), sep="\n")

In [None]:
# Number of tweets drawn from each class, so the sample is balanced.
SAMPLES_PER_CLASS = 200000

# Draw the same number of rows for raw labels 0 and 4, then shuffle the
# combined frame and renumber its index. Fixed seeds keep the draw repeatable.
df_sample = pd.concat([
    df[df["sentiment"] == 0].sample(SAMPLES_PER_CLASS, random_state=42),
    df[df["sentiment"] == 4].sample(SAMPLES_PER_CLASS, random_state=42),
]).sample(frac=1, random_state=42).reset_index(drop=True)

# Recode the raw labels: 0 stays 0, 4 becomes 1.
df_sample["sentiment"] = df_sample["sentiment"].map({0: 0, 4: 1})

# New columns are assigned to this frame later in the notebook; taking an
# explicit copy of the column slice avoids SettingWithCopyWarning there.
df_sample = df_sample[["text", "sentiment"]].copy()

print("Sample Dataset:")
print(df_sample.head(10))
print("\nYeni Sentiment Dağılımı:")
print(df_sample['sentiment'].value_counts())
print("\nÖrnek Pozitif Tweetler:")
print(df_sample[df_sample['sentiment'] == 1]['text'].head(3).values)
print("\nÖrnek Negatif Tweetler:")
print(df_sample[df_sample['sentiment'] == 0]['text'].head(3).values)


In [None]:
# English stop words as a set, for fast membership tests during cleaning.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # The stop-word corpus is a separate download; fetch it once so the
    # notebook also runs on a machine where it has not been installed yet.
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize a raw tweet into a space-separated string of lowercase words.

    Steps, in order: lowercase; remove @mentions; remove URLs; remove every
    character that is not an ASCII letter or whitespace; collapse whitespace;
    drop words found in the module-level ``stop_words`` set and words of two
    characters or fewer.

    Args:
        text: Raw tweet text. Any non-string value (for example NaN from a
            missing cell) is treated as empty.

    Returns:
        The surviving words joined by single spaces, or ``''`` if none survive.
    """
    if not isinstance(text, str):
        return ''

    # 1. Lowercase
    text = text.lower()

    # 2. Remove mentions (@username)
    text = re.sub(r'@\w+', '', text)

    # 3. Remove URLs. The word boundary and literal dot stop the "www" branch
    #    from matching inside elongated words such as "awwww".
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)

    # 4. Remove punctuation and digits (keep only letters and whitespace)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # 5. Collapse runs of whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # 6. Drop stop words and very short words
    tokens = [word for word in text.split() if len(word) > 2 and word not in stop_words]

    return ' '.join(tokens)


print("ÖNCE vs SONRA Karşılaştırması:\n")

# Show the first five sampled tweets before and after cleaning.
for i in range(5):
    row = df_sample.iloc[i]
    original = row['text']
    cleaned = clean_text(original)
    if row['sentiment'] == 1:
        sentiment = "POZİTİF"
    else:
        sentiment = "NEGATİF"

    print(
        sentiment,
        f"ÖNCE:  {original}",
        f"SONRA: {cleaned}",
        "-" * 80,
        sep="\n",
    )

In [None]:
print("Veri Temizlemesi")

# Time the cleaning pass over every sampled tweet.
start_time = time.time()
df_sample["cleaned_text"] = df_sample["text"].map(clean_text)
end_time = time.time()
print(f"Veri temizleme süresi: {end_time - start_time:.2f} saniye")

# Report, then drop, the tweets whose cleaned text is empty.
is_empty = df_sample['cleaned_text'].eq('')
print(f"\n Boş tweet sayısı: {is_empty.sum()}")
df_sample = df_sample.loc[~is_empty].reset_index(drop=True)
print(f"\n Kalan tweet sayısı: {len(df_sample)}")

print("\n Temizlenmiş Veri Örneği:")
print(df_sample[['cleaned_text', 'sentiment']].head(10))

# Number of whitespace-separated words in each cleaned tweet.
df_sample['word_count'] = [len(cleaned.split()) for cleaned in df_sample['cleaned_text']]
print("\n Kelime Sayısı İstatistikleri:")
print(df_sample['word_count'].describe())

In [None]:
# Join all cleaned tweets of each class into one long string per class.
is_positive = df_sample['sentiment'] == 1
is_negative = df_sample['sentiment'] == 0
positive_text = " ".join(df_sample.loc[is_positive, 'cleaned_text'])
negative_text = " ".join(df_sample.loc[is_negative, 'cleaned_text'])

def plot_wordcloud(text,title):
    """Draw a word cloud built from ``text`` and display it under ``title``.

    Args:
        text: String the word cloud is generated from.
        title: Title placed above the figure.
    """
    cloud_options = {
        'width': 800,
        'height': 400,
        'random_state': 21,
        'max_words': 100,
        'background_color': 'white',
    }
    wordcloud = WordCloud(**cloud_options).generate(text)

    plt.figure(figsize=(10,7))
    plt.imshow(wordcloud)
    plt.title(title)
    plt.axis('off')
    plt.show()

# One word cloud per class: (printed heading, source text, figure title).
for heading, corpus, title in (
    ("Pozitif Tweetlerde En Sık Geçen Kelimeler:", positive_text, "Pozitif Kelime Bulutu"),
    ("Negatif Tweetlerde En Sık Geçen Kelimeler:", negative_text, "Negatif Kelime Bulutu"),
):
    print(heading)
    plot_wordcloud(corpus, title)

In [None]:
# Features are the cleaned tweet strings; the target is the 0/1 sentiment label.
X = df_sample["cleaned_text"]
y = df_sample["sentiment"]

# Hold out 20% for testing, stratified on the label, with a fixed seed.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Veri Bölünümü")
print(f"Training set: {len(X_train)} tweet")
print(f"Test set: {len(X_test)} tweet")
print("\n Training set sentiment dağılımı:", y_train.value_counts(), sep="\n")
print("\n Test set sentiment dağılımı:", y_test.value_counts(), sep="\n")

In [None]:
# Vectorizer settings collected in one place.
vectorizer_options = {
    'max_features': 15000,
    'ngram_range': (1, 3),
    'min_df': 2,
    'max_df': 0.95,
    'sublinear_tf': True,
    'strip_accents': 'unicode',
    'analyzer': 'word',
    'token_pattern': r'\w{1,}',
    'use_idf': True,
    'smooth_idf': True,
}
tfidf = TfidfVectorizer(**vectorizer_options)

# Fit on the training split only; the test split is transformed with the
# vocabulary and weights learned from training.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

feature_names = tfidf.get_feature_names_out()

print(f"Training: {X_train_tfidf.shape}")
print(f"Test: {X_test_tfidf.shape}")
print(f"\n Toplam kelime sayısı: {len(feature_names)}")

# Peek at both ends of the feature-name array.
print("\n İlk 20 kelime:", feature_names[:20], sep="\n")
print("\n Son 20 kelime:", feature_names[-20:], sep="\n")

In [None]:
def fit_and_score(label, model):
    """Fit ``model`` on the TF-IDF training matrix and print its test accuracy.

    Reads ``X_train_tfidf``, ``y_train``, ``X_test_tfidf`` and ``y_test`` from
    the notebook namespace.

    Args:
        label: Display name used in the printed accuracy line.
        model: Classifier exposing ``fit`` and ``predict``; it is fitted in place.

    Returns:
        ``(predictions, accuracy)`` for the test split.
    """
    model.fit(X_train_tfidf, y_train)
    predictions = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{label} Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)\n")
    return predictions, accuracy


lr_model = LogisticRegression(
    C=2.0,
    max_iter=1000,
    solver='liblinear',
    random_state=42
)
lr_pred, lr_accuracy = fit_and_score("Logistic Regression", lr_model)

nb_model = MultinomialNB()
nb_pred, nb_accuracy = fit_and_score("Naive Bayes", nb_model)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_pred, rf_accuracy = fit_and_score("Random Forest", rf_model)


print("=" * 60)
print(" Modellerin Karşılaştırılması:")
print("=" * 60)
results = [
    ("Logistic Regression", lr_accuracy),
    ("Naive Bayes", nb_accuracy),
    ("Random Forest", rf_accuracy)
]

# Leaderboard, highest accuracy first.
for model_name, acc in sorted(results, key=lambda x: x[1], reverse=True):
    print(f"{model_name:.<30} {acc*100:.2f}%")

print("\n En İyi Modelimiz:", max(results, key=lambda x: x[1])[0])

In [None]:
# Bind `best_pred` here: nothing earlier in the notebook defines it, so on a
# fresh kernel (Restart & Run All) this cell would otherwise raise NameError.
# The predictions of the highest-accuracy model are used; on a tie max()
# keeps the first entry in the order listed.
best_pred = max(
    [(lr_accuracy, lr_pred), (nb_accuracy, nb_pred), (rf_accuracy, rf_pred)],
    key=lambda scored: scored[0],
)[1]

print(classification_report(y_test, best_pred,
                          target_names=['Negatif', 'Pozitif']))

In [None]:
# Pick the best model from the measured accuracies instead of hard-coding one,
# so the model, its predictions, its accuracy and the plot title always agree.
# On a tie max() keeps the first entry in the order listed.
candidates = [
    ("Logistic Regression", lr_model, lr_pred, lr_accuracy),
    ("Naive Bayes", nb_model, nb_pred, nb_accuracy),
    ("Random Forest", rf_model, rf_pred, rf_accuracy),
]
best_name, best_model, best_pred, best_acc = max(candidates, key=lambda entry: entry[3])

cm = confusion_matrix(y_test, best_pred)

plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Negatif','Pozitif'],
            yticklabels=['Negatif','Pozitif'],
            cbar_kws={'label':'Tweet Sayısı'},
            annot_kws={'size':16})
plt.title(f"Confusion Matrix - {best_name}\nAccuracy: {best_acc*100:.2f}%",
          fontsize=16, fontweight='bold')
plt.ylabel("Gerçek Sentiment")
plt.xlabel("Tahmin Edilen Sentiment")
plt.tight_layout()
plt.show()

In [None]:
models = ["Logistic Regression", "Naive Bayes", "Random Forest"]
accuracies = [score * 100 for score in (lr_accuracy, nb_accuracy, rf_accuracy)]
colors = ["blue", "green", "red"]

plt.figure(figsize=(12,7))
bars = plt.bar(models, accuracies, color=colors, edgecolor="black", linewidth=1.5)

# Write each accuracy just above its bar, centred horizontally.
for bar, acc in zip(bars, accuracies):
    x_center = bar.get_x() + bar.get_width()/2.
    height = bar.get_height()
    plt.text(x_center, height + 1, f'{acc:.2f}%',
             ha='center', va='bottom', fontsize=12, fontweight='bold')

# Dashed reference line at 50%.
plt.axhline(y=50, color='red', linestyle='--', linewidth=2, alpha=0.5,
            label='Rastgele Tahmin (50%)')
plt.legend(fontsize=11)

axis_label_style = {'fontsize': 14, 'fontweight': 'bold'}
plt.xlabel('Modeller', **axis_label_style)
plt.ylabel('Accuracy (%)', **axis_label_style)
plt.title('Model Performans Karşılaştırması', fontsize=16, fontweight='bold', pad=20)

plt.ylim(0, 100)
plt.grid(axis='y', alpha=0.3, linestyle='--')
plt.tight_layout()
plt.show()

In [None]:
def tahmin_et(text):
    """Predict the sentiment of one English sentence and print the result.

    The text is cleaned with ``clean_text``, vectorized with the fitted
    ``tfidf`` and classified by ``best_model``. The confidence, the original
    text, the cleaned text and the predicted label are printed.

    Args:
        text: Raw input sentence.

    Returns:
        None. Everything is written to stdout.
    """
    cleaned = clean_text(text)

    if not cleaned:
        # Tweets whose cleaned text is empty are dropped before training, so
        # there is nothing meaningful to classify; say so instead of printing
        # a label and a confidence.
        print(f"Tweet: '{text}'")
        print(f"Modelin Gördüğü: '{cleaned}'")
        print("Tahmin: YOK (temizleme sonrası anlamlı kelime kalmadı)")
        print("-" * 40)
        return

    vectorized = tfidf.transform([cleaned])

    prediction = best_model.predict(vectorized)[0]
    proba = best_model.predict_proba(vectorized)[0]

    sentiment = "POZİTİF " if prediction == 1 else "NEGATİF "

    # Look the probability column up by class label rather than assuming the
    # label value equals its column position.
    class_index = list(best_model.classes_).index(prediction)
    confidence = proba[class_index]
    print(f"Confidence: {confidence*100:.2f}%")

    print(f"Tweet: '{text}'")

    print(f"Modelin Gördüğü: '{cleaned}'")
    print(f"Tahmin: {sentiment}")
    print("-" * 40)


# Fixed example sentences, classified in this order.
example_sentences = (
    # Positive
    "I love this product, it is amazing and perfect!",
    # Negative
    "This is the worst movie I have ever seen. Terrible acting.",
    # Mixed
    "I am so happy that I bought this.",
    "My flight was delayed and my luggage is lost.",
)

for example in example_sentences:
    tahmin_et(example)


# Interactive loop: classify each sentence the user types until they enter 'q'.
while True:
    try:
        # The prompt states how to quit; surrounding whitespace is ignored.
        kullanici_girisi = input("\nBir İngilizce cümle yazın (çıkış için 'q'): ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the run was interrupted: leave the loop cleanly
        # instead of ending the cell with a traceback.
        print("Programdan çıkılıyor... ")
        break

    if kullanici_girisi.lower() == 'q':
        print("Programdan çıkılıyor... ")
        break

    # Nothing was typed; ask again rather than classifying an empty string.
    if not kullanici_girisi:
        continue

    tahmin_et(kullanici_girisi)
