In [1]:
# Import pandas to read in data
import pandas as pd

In [2]:
# Read 'X.csv' into a DataFrame: df1 and 'y.csv' into a DataFrame: df2
df1 = pd.read_csv('X.csv')
df2 = pd.read_csv('y.csv')

# Concatenate df1 and df2 horizontally: df1_train
df1_train = pd.concat([df1, df2], ignore_index=True)

print(df1_train.head())

                                text sentiment
0  id have responded if i were going   neutral
1                            soo sad  negative
2                        bullying me  negative
3                     leave me alone  negative
4                            sons of  negative


In [3]:
# Import tfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize a TfidfVectorizer object: tfidf_vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

In [4]:
# Fill NaN values in 'text' column with empty string
df1_train['text'] = df1_train['text'].fillna('')

# Transform the training data column 'text': tfidf_matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(df1_train['text'])

# Print the shape of tfidf_matrix
print(tfidf_matrix.shape)

(31014, 19732)


In [5]:
# Get the words: words
words = tfidf_vectorizer.get_feature_names_out()

# Print words
print(words.shape)

(19732,)


In [6]:
# Convert TF-IDF matrix to a DataFrame
df2_train = pd.DataFrame(tfidf_matrix.toarray(), columns=words)

# Print the head of df2_train
print(df2_train.head())

    aa  aac  aaggess  aaggh  aah  aahaha  aahh  aam  aamazing  aandnothin  \
0  0.0  0.0      0.0    0.0  0.0     0.0   0.0  0.0       0.0         0.0   
1  0.0  0.0      0.0    0.0  0.0     0.0   0.0  0.0       0.0         0.0   
2  0.0  0.0      0.0    0.0  0.0     0.0   0.0  0.0       0.0         0.0   
3  0.0  0.0      0.0    0.0  0.0     0.0   0.0  0.0       0.0         0.0   
4  0.0  0.0      0.0    0.0  0.0     0.0   0.0  0.0       0.0         0.0   

   ...  â½t  â½tition  â½ureo  â½ve  â½we  â½why  â½whyy  â½y  â½you  â½ã  
0  ...  0.0       0.0     0.0   0.0   0.0    0.0     0.0  0.0    0.0  0.0  
1  ...  0.0       0.0     0.0   0.0   0.0    0.0     0.0  0.0    0.0  0.0  
2  ...  0.0       0.0     0.0   0.0   0.0    0.0     0.0  0.0    0.0  0.0  
3  ...  0.0       0.0     0.0   0.0   0.0    0.0     0.0  0.0    0.0  0.0  
4  ...  0.0       0.0     0.0   0.0   0.0    0.0     0.0  0.0    0.0  0.0  

[5 rows x 19732 columns]


In [27]:
df2_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31014 entries, 0 to 31013
Columns: 19732 entries, aa to â½ã
dtypes: float64(19732)
memory usage: 4.6 GB


In [7]:
# Use 'label_encoder' to convert sentiment labels to numbers
from sklearn.preprocessing import LabelEncoder

# Initialize a LabelEncoder object: label_encoder
label_encoder = LabelEncoder()

# Transform the sentiment labels of df1_train: y
y = label_encoder.fit_transform(df1_train['sentiment'])

In [8]:
# Import train_test_split for splitting the data
from sklearn.model_selection import train_test_split

# Split the data into 70% training and 30% test data
X_train, X_test, y_train, y_test = train_test_split(df2_train, y, test_size=0.3, random_state=42)

# Print the shapes of X_train, X_test, and y_train and y_test
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(21709, 19732)
(9305, 19732)
(21709,)
(9305,)


In [30]:
# Import the necessary modules for the model logreg
from sklearn.linear_model import LogisticRegression

# Initialize a Logistic Regression model: logreg
logreg = LogisticRegression()

In [31]:
# Fit the model to the training data
logreg.fit(X_train, y_train)

In [32]:
# Predict the labels of the test set: y_pred
y_pred = logreg.predict(X_test)

In [34]:
# Import the necessary modules for the model accuracy, precision, recall, and f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Compute and print the accuracy
print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))

# Compute and print the precision
print("Precision: {}".format(precision_score(y_test, y_pred, average='weighted')))

# Compute and print the recall
print("Recall: {}".format(recall_score(y_test, y_pred, average='weighted')))

# Compute and print the f1_score
print("F1 Score: {}".format(f1_score(y_test, y_pred, average='weighted')))

Accuracy: 0.7773240193444385
Precision: 0.792865360453448
Recall: 0.7773240193444385
F1 Score: 0.7765788915242057


In [None]:
# Import the necessary modules for the svm model
from sklearn.svm import SVC

# Initialize a Support Vector Classifier model: svm
svm = SVC()

# Fit the model to the training data
svm.fit(X_train, y_train)