Skip to content

Commit

Permalink
initial step towards implementing C string literals
Browse files Browse the repository at this point in the history
  • Loading branch information
fee1-dead committed May 2, 2023
1 parent 7b99493 commit 8ff3903
Show file tree
Hide file tree
Showing 17 changed files with 310 additions and 80 deletions.
3 changes: 3 additions & 0 deletions compiler/rustc_ast/src/ast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1814,6 +1814,8 @@ pub enum LitKind {
/// A byte string (`b"foo"`). Not stored as a symbol because it might be
/// non-utf8, and symbols only allow utf8 strings.
ByteStr(Lrc<[u8]>, StrStyle),
/// A C string (`c"foo"`). When built from a token literal, the stored bytes end with the nul terminator.
CStr(Lrc<[u8]>, StrStyle),
/// A byte char (`b'f'`).
Byte(u8),
/// A character literal (`'a'`).
Expand Down Expand Up @@ -1868,6 +1870,7 @@ impl LitKind {
// unsuffixed variants
LitKind::Str(..)
| LitKind::ByteStr(..)
| LitKind::CStr(..)
| LitKind::Byte(..)
| LitKind::Char(..)
| LitKind::Int(_, LitIntType::Unsuffixed)
Expand Down
7 changes: 7 additions & 0 deletions compiler/rustc_ast/src/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ pub enum LitKind {
StrRaw(u8), // raw string delimited by `n` hash symbols
ByteStr,
ByteStrRaw(u8), // raw byte string delimited by `n` hash symbols
CStr,
CStrRaw(u8),
Err,
}

Expand Down Expand Up @@ -141,6 +143,10 @@ impl fmt::Display for Lit {
delim = "#".repeat(n as usize),
string = symbol
)?,
// C string literals are written with a `c` prefix; the raw form is wrapped in
// `n` hash symbols on both sides of the quotes.
CStr => write!(f, "c\"{symbol}\"")?,
CStrRaw(n) => {
    let delim = "#".repeat(n as usize);
    write!(f, "cr{delim}\"{symbol}\"{delim}")?
}
Integer | Float | Bool | Err => write!(f, "{symbol}")?,
}

Expand Down Expand Up @@ -170,6 +176,7 @@ impl LitKind {
Float => "float",
Str | StrRaw(..) => "string",
ByteStr | ByteStrRaw(..) => "byte string",
CStr | CStrRaw(..) => "C string",
Err => "error",
}
}
Expand Down
55 changes: 54 additions & 1 deletion compiler/rustc_ast/src/util/literal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@

use crate::ast::{self, LitKind, MetaItemLit, StrStyle};
use crate::token::{self, Token};
use rustc_lexer::unescape::{byte_from_char, unescape_byte, unescape_char, unescape_literal, Mode};
use rustc_lexer::unescape::{
byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, CStrUnit,
Mode,
};
use rustc_span::symbol::{kw, sym, Symbol};
use rustc_span::Span;
use std::{ascii, fmt, str};
Expand Down Expand Up @@ -158,6 +161,52 @@ impl LitKind {

LitKind::ByteStr(bytes.into(), StrStyle::Raw(n))
}
token::CStr => {
    // Unescape the cooked literal into bytes. An interior nul (byte or char) is
    // reported as `LitError::NulInCStr`; the nul terminator is appended afterwards.
    let src = symbol.as_str();
    let mut bytes = Vec::with_capacity(src.len());
    let mut outcome = Ok(());
    unescape_c_string(src, Mode::CStr, &mut |span, unit| match unit {
        Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) => {
            outcome = Err(LitError::NulInCStr(span));
        }
        Ok(CStrUnit::Byte(byte)) => bytes.push(byte),
        // For a one-byte char, `encode_utf8` yields exactly `ch as u8`, so a
        // single arm covers every char width.
        Ok(CStrUnit::Char(ch)) => {
            bytes.extend_from_slice(ch.encode_utf8(&mut [0; 4]).as_bytes())
        }
        Err(err) if err.is_fatal() => outcome = Err(LitError::LexerError),
        // Non-fatal unescape errors do not fail the conversion.
        Err(_) => {}
    });
    outcome?;
    bytes.push(b'\0');
    LitKind::CStr(bytes.into(), StrStyle::Cooked)
}
token::CStrRaw(n) => {
    // Same conversion as the cooked C string arm, but run in raw mode. An interior
    // nul is reported as `LitError::NulInCStr`; the terminator is appended afterwards.
    let src = symbol.as_str();
    let mut bytes = Vec::with_capacity(src.len());
    let mut outcome = Ok(());
    unescape_c_string(src, Mode::RawCStr, &mut |span, unit| match unit {
        Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) => {
            outcome = Err(LitError::NulInCStr(span));
        }
        Ok(CStrUnit::Byte(byte)) => bytes.push(byte),
        // For a one-byte char, `encode_utf8` yields exactly `ch as u8`, so a
        // single arm covers every char width.
        Ok(CStrUnit::Char(ch)) => {
            bytes.extend_from_slice(ch.encode_utf8(&mut [0; 4]).as_bytes())
        }
        Err(err) if err.is_fatal() => outcome = Err(LitError::LexerError),
        // Non-fatal unescape errors do not fail the conversion.
        Err(_) => {}
    });
    outcome?;
    bytes.push(b'\0');
    LitKind::CStr(bytes.into(), StrStyle::Raw(n))
}
token::Err => LitKind::Err,
})
}
Expand Down Expand Up @@ -191,6 +240,8 @@ impl fmt::Display for LitKind {
string = symbol
)?;
}
// TODO need to reescape
LitKind::CStr(..) => todo!(),
LitKind::Int(n, ty) => {
write!(f, "{n}")?;
match ty {
Expand Down Expand Up @@ -237,6 +288,8 @@ impl MetaItemLit {
LitKind::Str(_, ast::StrStyle::Raw(n)) => token::StrRaw(n),
LitKind::ByteStr(_, ast::StrStyle::Cooked) => token::ByteStr,
LitKind::ByteStr(_, ast::StrStyle::Raw(n)) => token::ByteStrRaw(n),
LitKind::CStr(_, ast::StrStyle::Cooked) => token::CStr,
LitKind::CStr(_, ast::StrStyle::Raw(n)) => token::CStrRaw(n),
LitKind::Byte(_) => token::Byte,
LitKind::Char(_) => token::Char,
LitKind::Int(..) => token::Integer,
Expand Down
2 changes: 2 additions & 0 deletions compiler/rustc_ast_pretty/src/pprust/state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,8 @@ pub fn literal_to_string(lit: token::Lit) -> String {
token::ByteStrRaw(n) => {
format!("br{delim}\"{string}\"{delim}", delim = "#".repeat(n as usize), string = symbol)
}
// TODO
token::CStr | token::CStrRaw(_) => todo!(),
token::Integer | token::Float | token::Bool | token::Err => symbol.to_string(),
};

Expand Down
4 changes: 4 additions & 0 deletions compiler/rustc_builtin_macros/src/concat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ pub fn expand_concat(
Ok(ast::LitKind::Bool(b)) => {
accumulator.push_str(&b.to_string());
}
Ok(ast::LitKind::CStr(..)) => {
cx.span_err(e.span, "cannot concatenate a C string literal");
has_errors = true;
}
Ok(ast::LitKind::Byte(..) | ast::LitKind::ByteStr(..)) => {
cx.emit_err(errors::ConcatBytestr { span: e.span });
has_errors = true;
Expand Down
4 changes: 4 additions & 0 deletions compiler/rustc_builtin_macros/src/concat_bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ fn invalid_type_err(
};
let snippet = cx.sess.source_map().span_to_snippet(span).ok();
match ast::LitKind::from_token_lit(token_lit) {
Ok(ast::LitKind::CStr(_, _)) => {
    // C string literals are rejected here with a plain error at the literal's span.
    // TODO
    cx.span_err(span, "cannot concatenate C string literals");
}
Ok(ast::LitKind::Char(_)) => {
let sugg =
snippet.map(|snippet| ConcatBytesInvalidSuggestion::CharLit { span, snippet });
Expand Down
4 changes: 4 additions & 0 deletions compiler/rustc_expand/src/proc_macro_server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ impl FromInternal<token::LitKind> for LitKind {
token::StrRaw(n) => LitKind::StrRaw(n),
token::ByteStr => LitKind::ByteStr,
token::ByteStrRaw(n) => LitKind::ByteStrRaw(n),
// TODO
token::CStr | token::CStrRaw(_) => todo!(),
token::Err => LitKind::Err,
token::Bool => unreachable!(),
}
Expand Down Expand Up @@ -436,6 +438,8 @@ impl server::FreeFunctions for Rustc<'_, '_> {
| token::LitKind::StrRaw(_)
| token::LitKind::ByteStr
| token::LitKind::ByteStrRaw(_)
| token::LitKind::CStr
| token::LitKind::CStrRaw(_)
| token::LitKind::Err => return Err(()),
token::LitKind::Integer | token::LitKind::Float => {}
}
Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_hir/src/lang_items.rs
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,7 @@ language_item_table! {
RangeTo, sym::RangeTo, range_to_struct, Target::Struct, GenericRequirement::None;

String, sym::String, string, Target::Struct, GenericRequirement::None;
CStr, sym::CStr, c_str, Target::Struct, GenericRequirement::None;
}

pub enum GenericRequirement {
Expand Down
5 changes: 5 additions & 0 deletions compiler/rustc_hir_typeck/src/fn_ctxt/checks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1300,6 +1300,11 @@ impl<'a, 'tcx> FnCtxt<'a, 'tcx> {
opt_ty.unwrap_or_else(|| self.next_float_var())
}
ast::LitKind::Bool(_) => tcx.types.bool,
ast::LitKind::CStr(_, _) => {
    // A C string literal is typed as `&'static T`, where `T` is the type of the
    // `CStr` lang item (required to exist; looked up at the literal's span).
    let c_str_def = tcx.require_lang_item(hir::LangItem::CStr, Some(lit.span));
    let c_str_ty = tcx.type_of(c_str_def).skip_binder();
    tcx.mk_imm_ref(tcx.lifetimes.re_static, c_str_ty)
}
ast::LitKind::Err => tcx.ty_error_misc(),
}
}
Expand Down
30 changes: 30 additions & 0 deletions compiler/rustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -186,12 +186,16 @@ pub enum LiteralKind {
Str { terminated: bool },
/// "b"abc"", "b"abc"
ByteStr { terminated: bool },
/// `c"abc"`, `c"abc`
CStr { terminated: bool },
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates
/// an invalid literal.
RawStr { n_hashes: Option<u8> },
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None`
/// indicates an invalid literal.
RawByteStr { n_hashes: Option<u8> },
/// `cr"abc"`, `cr#"abc"#`, `cr#"a`. `None` indicates an invalid literal.
RawCStr { n_hashes: Option<u8> },
}

#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
Expand Down Expand Up @@ -391,6 +395,32 @@ impl Cursor<'_> {
_ => self.ident_or_unknown_prefix(),
},

// TODO deduplicate this code
// c-string literal, raw c-string literal or identifier.
'c' => match (self.first(), self.second()) {
    // `c"` opens a cooked C string literal.
    ('"', _) => {
        self.bump();
        let terminated = self.double_quoted_string();
        let suffix_start = self.pos_within_token();
        // A suffix is only consumed when the literal was properly terminated.
        if terminated {
            self.eat_literal_suffix();
        }
        Literal { kind: CStr { terminated }, suffix_start }
    }
    // `cr"` or `cr#` opens a raw C string literal.
    ('r', '"' | '#') => {
        self.bump();
        let hashes = self.raw_double_quoted_string(2);
        let suffix_start = self.pos_within_token();
        // As above, the suffix is only consumed after a successful scan.
        if hashes.is_ok() {
            self.eat_literal_suffix();
        }
        Literal { kind: RawCStr { n_hashes: hashes.ok() }, suffix_start }
    }
    // Any other continuation is handled as an identifier or unknown prefix.
    _ => self.ident_or_unknown_prefix(),
},

// Identifier (this should be checked after other variant that can
// start as identifier).
c if is_id_start(c) => self.ident_or_unknown_prefix(),
Expand Down

0 comments on commit 8ff3903

Please sign in to comment.